From cf60682c8290f1191c5d3e4609a8ad3b8d1b162a Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 001/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Download api_server.py from the vLLM v0.7.0 tag into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace `host_ip` with your external IP address; do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing an article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  docsum-vllm-service:
+    image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+    container_name: docsum-vllm-service
+    ports:
+      - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+      HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+      HF_HUB_DISABLE_PROGRESS_BARS: 1
+      HF_HUB_ENABLE_HF_TRANSFER: 0
+      VLLM_USE_TRITON_FLASH_ATTENTION: 0
+      PYTORCH_JIT: 0
+    volumes:
+      - "./data:/data"
+    shm_size: 20G
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri/:/dev/dri/
+    cap_add:
+      - SYS_PTRACE
+    group_add:
+      - video
+    security_opt:
+      - seccomp:unconfined
+      - apparmor=unconfined
+    command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+    ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+  docsum-backend-server:
+    image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+    container_name: docsum-backend-server
+    depends_on:
+      - docsum-vllm-service
+      - docsum-llm-server
+    ports:
+      - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+    environment:
+      no_proxy: ${no_proxy}
+      https_proxy: ${https_proxy}
+      http_proxy: ${http_proxy}
+      MEGA_SERVICE_HOST_IP: ${HOST_IP}
+      LLM_SERVICE_HOST_IP: ${HOST_IP}
+      ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+    ipc: host
+    restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 1fd1de1530328321d28aa6d9db85fffeb876574c Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 002/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Download api_server.py from the vLLM v0.7.0 tag into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace `host_ip` with your external IP address; do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing an article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From bd2d47e7e53e1241c27aed0f823fa680d8ecf4e2 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 003/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images` to confirm the `opea/llm-docsum-tgi:latest` image was built:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+```
+
+Note: Please replace `host_ip` with your external IP address; do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the render node index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. vLLM Service
+
+   ```bash
+   curl http://${host_ip}:8008/v1/chat/completions \
+   -X POST \
+   -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 64}' \
+   -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing an article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+      VLLM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+      - docsum-vllm-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 2459ecbc53fdb7c9c449930700cff290de15c152 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 004/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV WILM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing a article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From 6d5049dd1c6bb3e201c4ca807da6950e0ab4b9d2 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 005/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace `host_ip` with your external IP address, do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the render node index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing an article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5174` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ VLLM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+ - docsum-vllm-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 9dfbdc5cffe708b084e7367d6df2910908f5e76a Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 006/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the render node index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing an article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5174` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- VLLM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-vllm-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From a8857ae326b2d71ca66bc6f86715ac9ab467ac85 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 007/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV WILM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing a article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ WILM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+ - docsum-tgi-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 5a38b266ac77a2bf0766cefab14ec62f28633a8d Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 008/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV WILM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing a article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From 9ccf540b892c0ae3a58a004afcb01d3647a92c90 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 24 Apr 2025 20:01:07 +0700
Subject: [PATCH 009/226] DocSum - refactoring README.md
Signed-off-by: Chingis Yundunov
---
DocSum/docker_compose/amd/gpu/rocm/README.md | 138 +++++++++++++++----
1 file changed, 108 insertions(+), 30 deletions(-)
diff --git a/DocSum/docker_compose/amd/gpu/rocm/README.md b/DocSum/docker_compose/amd/gpu/rocm/README.md
index 2c4a196149..92922f4b65 100644
--- a/DocSum/docker_compose/amd/gpu/rocm/README.md
+++ b/DocSum/docker_compose/amd/gpu/rocm/README.md
@@ -25,15 +25,15 @@ This section describes how to quickly deploy and test the DocSum service manuall
Clone the GenAIExample repository and access the ChatQnA AMD GPU platform Docker Compose files and supporting scripts:
-```
+```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
```
-Checkout a released version, such as v1.2:
+Checkout a released version, such as v1.3:
```
-git checkout v1.2
+git checkout v1.3
```
### Generate a HuggingFace Access Token
@@ -42,33 +42,96 @@ Some HuggingFace resources, such as some models, are only accessible if you have
### Configure the Deployment Environment
-To set up environment variables for deploying DocSum services, source the _set_env.sh_ script in this directory:
+To set up environment variables for deploying ChatQnA services, set up some parameters specific to the deployment environment and source the `set_env_*.sh` script in this directory:
-```
-source ./set_env.sh
+- if used vLLM - set_env_vllm.sh
+- if used TGI - set_env.sh
+
+Set the values of the variables:
+
+- **HOST_IP, HOST_IP_EXTERNAL** - These variables are used to configure the name/address of the service in the operating system environment for the application services to interact with each other and with the outside world.
+
+ If your server uses only an internal address and is not accessible from the Internet, then the values for these two variables will be the same and the value will be equal to the server's internal name/address.
+
+ If your server uses only an external, Internet-accessible address, then the values for these two variables will be the same and the value will be equal to the server's external name/address.
+
+ If your server is located on an internal network, has an internal address, but is accessible from the Internet via a proxy/firewall/load balancer, then the HOST_IP variable will have a value equal to the internal name/address of the server, and the EXTERNAL_HOST_IP variable will have a value equal to the external name/address of the proxy/firewall/load balancer behind which the server is located.
+
+ We set these values in the file set_env\*\*\*\*.sh
+
+- **Variables with names like "**\*\*\*\*\*\*\_PORT"\*\* - These variables set the IP port numbers for establishing network connections to the application services.
+ The values shown in the file set_env.sh or set_env_vllm.sh they are the values used for the development and testing of the application, as well as configured for the environment in which the development is performed. These values must be configured in accordance with the rules of network access to your environment's server, and must not overlap with the IP ports of other applications that are already in use.
+
+Setting variables in the operating system environment:
+
+```bash
+export HUGGINGFACEHUB_API_TOKEN="Your_HuggingFace_API_Token"
+source ./set_env_*.sh # replace the script name with the appropriate one
```
-The _set_env.sh_ script will prompt for required and optional environment variables used to configure the DocSum services. If a value is not entered, the script will use a default value for the same. It will also generate a _.env_ file defining the desired configuration. Consult the section on [DocSum Service configuration](#docsum-service-configuration) for information on how service specific configuration parameters affect deployments.
+Consult the section on [DocSum Service configuration](#docsum-configuration) for information on how service specific configuration parameters affect deployments.
### Deploy the Services Using Docker Compose
-To deploy the DocSum services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute:
+To deploy the DocSum services, execute the `docker compose up` command with the appropriate arguments. For a default deployment with TGI, execute the command below. It uses the 'compose.yaml' file.
```bash
-docker compose up -d
+cd docker_compose/amd/gpu/rocm
+# if used TGI
+docker compose -f compose.yaml up -d
+# if used vLLM
+# docker compose -f compose_vllm.yaml up -d
+```
+
+To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose file:
+
+- compose_vllm.yaml - for vLLM-based application
+- compose.yaml - for TGI-based
+
+```yaml
+shm_size: 1g
+devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri:/dev/dri
+cap_add:
+ - SYS_PTRACE
+group_add:
+ - video
+security_opt:
+ - seccomp:unconfined
+```
+
+This configuration forwards all available GPUs to the container. To use a specific GPU, specify its `cardN` and `renderN` device IDs. For example:
+
+```yaml
+shm_size: 1g
+devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/render128:/dev/dri/render128
+cap_add:
+ - SYS_PTRACE
+group_add:
+ - video
+security_opt:
+ - seccomp:unconfined
```
-**Note**: developers should build docker image from source when:
+**How to Identify GPU Device IDs:**
+Use AMD GPU driver utilities to determine the correct `cardN` and `renderN` IDs for your GPU.
-- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
-- Unable to download the docker image.
-- Use a specific version of Docker image.
+> **Note**: developers should build docker image from source when:
+>
+> - Developing off the git main branch (as the container's ports in the repo may be different > from the published docker image).
+> - Unable to download the docker image.
+> - Use a specific version of Docker image.
Please refer to the table below to build different microservices from source:
| Microservice | Deployment Guide |
-| ------------ | ------------------------------------------------------------------------------------------------------------------------------------- |
+|--------------| ------------------------------------------------------------------------------------------------------------------------------------- |
| whisper | [whisper build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/whisper/src) |
+| TGI | [TGI project](https://github.com/huggingface/text-generation-inference.git) |
| vLLM | [vLLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/vllm#build-docker) |
| llm-docsum | [LLM-DocSum build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/doc-summarization#12-build-docker-image) |
| MegaService | [MegaService build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image) |
@@ -84,6 +147,7 @@ docker ps -a
For the default deployment, the following 5 containers should have started:
+If used TGI:
```
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
748f577b3c78 opea/whisper:latest "python whisper_s…" 5 minutes ago Up About a minute 0.0.0.0:7066->7066/tcp, :::7066->7066/tcp whisper-service
@@ -93,24 +157,38 @@ fds3dd5b9fd8 opea/docsum:latest "py
78964d0c1hg5 ghcr.io/huggingface/text-generation-inference:2.4.1-rocm "/tgi-entrypoint.sh" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:8008->80/tcp, [::]:8008->80/tcp docsum-tgi-service
```
+If used vLLM:
+```
+CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
+748f577b3c78 opea/whisper:latest "python whisper_s…" 5 minutes ago Up About a minute 0.0.0.0:7066->7066/tcp, :::7066->7066/tcp whisper-service
+4eq8b7034fd9 opea/docsum-gradio-ui:latest "docker-entrypoint.s…" 5 minutes ago Up About a minute 0.0.0.0:5173->5173/tcp, :::5173->5173/tcp docsum-ui-server
+fds3dd5b9fd8 opea/docsum:latest "python docsum.py" 5 minutes ago Up About a minute 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp docsum-backend-server
+78fsd6fabfs7 opea/llm-docsum:latest "bash entrypoint.sh" 5 minutes ago Up About a minute 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp docsum-llm-server
+78964d0c1hg5 opea/vllm-rocm:latest "python3 /workspace/…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:8008->80/tcp, [::]:8008->80/tcp docsum-vllm-service
+```
+
### Test the Pipeline
Once the DocSum services are running, test the pipeline using the following command:
```bash
-curl -X POST http://${host_ip}:8888/v1/docsum \
+curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
```
-**Note** The value of _host_ip_ was set using the _set_env.sh_ script and can be found in the _.env_ file.
+**Note** The value of _HOST_IP_ was set using the _set_env.sh_ script and can be found in the _.env_ file.
### Cleanup the Deployment
To stop the containers associated with the deployment, execute the following command:
-```
+```bash
+# if used TGI
docker compose -f compose.yaml down
+# if used vLLM
+# docker compose -f compose_vllm.yaml down
+
```
All the DocSum containers will be stopped and then removed on completion of the "down" command.
@@ -132,7 +210,7 @@ There are also some customized usage.
```bash
# form input. Use English mode (default).
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
@@ -141,7 +219,7 @@ curl http://${host_ip}:8888/v1/docsum \
-F "stream=True"
# Use Chinese mode.
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。" \
@@ -150,7 +228,7 @@ curl http://${host_ip}:8888/v1/docsum \
-F "stream=True"
# Upload file
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
@@ -166,11 +244,11 @@ curl http://${host_ip}:8888/v1/docsum \
Audio:
```bash
-curl -X POST http://${host_ip}:8888/v1/docsum \
+curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=audio" \
-F "messages=UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" \
@@ -182,11 +260,11 @@ curl http://${host_ip}:8888/v1/docsum \
Video:
```bash
-curl -X POST http://${host_ip}:8888/v1/docsum \
+curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "video", "messages": "convert your video to base64 data type"}'
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=video" \
-F "messages=convert your video to base64 data type" \
@@ -208,7 +286,7 @@ If you want to deal with long context, can set following parameters and select s
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
```bash
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
@@ -223,7 +301,7 @@ curl http://${host_ip}:8888/v1/docsum \
In this mode LLM generate summary based on complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed LLM context limit and raise error when meet long context.
```bash
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
@@ -238,7 +316,7 @@ curl http://${host_ip}:8888/v1/docsum \
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
@@ -255,7 +333,7 @@ Map_reduce mode will split the inputs into multiple chunks, map each document to
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
@@ -272,7 +350,7 @@ Refine mode will split the inputs into multiple chunks, generate summary for the
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
-curl http://${host_ip}:8888/v1/docsum \
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
@@ -288,7 +366,7 @@ Several UI options are provided. If you need to work with multimedia documents,
### Gradio UI
-To access the UI, use the URL - http://${EXTERNAL_HOST_IP}:${FAGGEN_UI_PORT}
+To access the UI, use the URL - http://${HOST_IP}:${DOCSUM_FRONTEND_PORT}
A page should open when you click through to this address:

From b5df3482351bc2a00f262f8449ea410ef0e817ab Mon Sep 17 00:00:00 2001
From: xiguiw <111278656+xiguiw@users.noreply.github.com>
Date: Wed, 19 Feb 2025 19:24:10 +0800
Subject: [PATCH 010/226] Fix mismatched environment variable (#1575)
Signed-off-by: Wang, Xigui
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/intel/cpu/aipc/set_env.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ChatQnA/docker_compose/intel/cpu/aipc/set_env.sh b/ChatQnA/docker_compose/intel/cpu/aipc/set_env.sh
index 4eda65f97a..3ee4cd6d6c 100644
--- a/ChatQnA/docker_compose/intel/cpu/aipc/set_env.sh
+++ b/ChatQnA/docker_compose/intel/cpu/aipc/set_env.sh
@@ -17,7 +17,7 @@ if [ -z "${host_ip}" ]; then
echo "Error: host_ip is not set. Please set host_ip first."
fi
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"
From 60dd862d5ac44e6e7b1b41595e02b727ff8b8244 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Thu, 20 Feb 2025 14:41:52 +0800
Subject: [PATCH 011/226] Fix trivy issue (#1569)
Fix docker image security issue
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
AvatarChatbot/Dockerfile | 2 +-
ChatQnA/tests/test_compose_on_gaudi.sh | 3 +--
EdgeCraftRAG/ui/docker/Dockerfile.ui | 3 ++-
VideoQnA/ui/docker/Dockerfile | 2 +-
4 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/AvatarChatbot/Dockerfile b/AvatarChatbot/Dockerfile
index 3266bc296a..f0fa5744e7 100644
--- a/AvatarChatbot/Dockerfile
+++ b/AvatarChatbot/Dockerfile
@@ -32,7 +32,7 @@ COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip && \
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh
index 59ffbb3ded..2785995bbb 100644
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -2,7 +2,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-set -xe
+set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
@@ -47,7 +47,6 @@ function start_services() {
export NUM_CARDS=1
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export host_ip=${ip_address}
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
diff --git a/EdgeCraftRAG/ui/docker/Dockerfile.ui b/EdgeCraftRAG/ui/docker/Dockerfile.ui
index 3dacb35d8d..8abffc5557 100644
--- a/EdgeCraftRAG/ui/docker/Dockerfile.ui
+++ b/EdgeCraftRAG/ui/docker/Dockerfile.ui
@@ -15,7 +15,8 @@ RUN mkdir -p /home/user/gradio_cache
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
WORKDIR /home/user/ui
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+ pip install --no-cache-dir -r requirements.txt
USER user
diff --git a/VideoQnA/ui/docker/Dockerfile b/VideoQnA/ui/docker/Dockerfile
index dcd029a0b8..019999de8a 100644
--- a/VideoQnA/ui/docker/Dockerfile
+++ b/VideoQnA/ui/docker/Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get update && apt-get install -y curl && \
rm -rf /var/lib/apt/lists/*
-RUN pip install --no-cache-dir --upgrade pip && \
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir streamlit
COPY ui.py /app/ui.py
From 06d31cc67426fe4501cc743ebc9b687b21af257d Mon Sep 17 00:00:00 2001
From: minmin-intel
Date: Fri, 21 Feb 2025 17:51:26 -0800
Subject: [PATCH 012/226] Update AgentQnA and DocIndexRetriever (#1564)
Signed-off-by: minmin-intel
Signed-off-by: Chingis Yundunov
---
AgentQnA/README.md | 43 ++++------
.../intel/cpu/xeon/compose_openai.yaml | 11 ++-
.../cpu/xeon/launch_agent_service_openai.sh | 4 +-
.../intel/hpu/gaudi/compose.yaml | 5 +-
.../hpu/gaudi/launch_agent_service_gaudi.sh | 2 +-
... step4_launch_and_validate_agent_gaudi.sh} | 42 ++--------
AgentQnA/tests/test.py | 79 ++++++++++++-------
AgentQnA/tests/test_compose_on_gaudi.sh | 2 +-
.../intel/cpu/xeon/compose.yaml | 6 ++
DocIndexRetriever/retrieval_tool.py | 56 ++++++++-----
DocIndexRetriever/tests/test.py | 38 +++++++++
11 files changed, 170 insertions(+), 118 deletions(-)
rename AgentQnA/tests/{step4_launch_and_validate_agent_tgi.sh => step4_launch_and_validate_agent_gaudi.sh} (87%)
create mode 100644 DocIndexRetriever/tests/test.py
diff --git a/AgentQnA/README.md b/AgentQnA/README.md
index d45b14ef55..397bd0c775 100644
--- a/AgentQnA/README.md
+++ b/AgentQnA/README.md
@@ -84,7 +84,7 @@ flowchart LR
3. Hierarchical multi-agents can improve performance.
Expert worker agents, such as RAG agent and SQL agent, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information together to provide a comprehensive answer. If we only use one agent and provide all the tools to this single agent, it may get overwhelmed and not able to provide accurate answers.
-## Deployment with docker
+## Deploy with docker
1. Build agent docker image [Optional]
@@ -217,13 +217,19 @@ docker build -t opea/agent:latest --build-arg https_proxy=$https_proxy --build-a
:::
::::
+## Deploy AgentQnA UI
+
+The AgentQnA UI can be deployed locally or using Docker.
+
+For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
+
## Deploy using Helm Chart
Refer to the [AgentQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AgentQnA on Kubernetes.
## Validate services
-First look at logs of the agent docker containers:
+1. First look at logs of the agent docker containers:
```
# worker RAG agent
@@ -240,35 +246,18 @@ docker logs react-agent-endpoint
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
-Second, validate worker RAG agent:
+2. You can use python to validate the agent system
-```
-curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
- "messages": "Michael Jackson song Thriller"
- }'
-```
+```bash
+# RAG worker agent
+python tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
-Third, validate worker SQL agent:
+# SQL agent
+python tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
+# supervisor agent: this will test a two-turn conversation
+python tests/test.py --agent_role "supervisor" --ext_port 9090
```
-curl http://${host_ip}:9096/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
- "messages": "How many employees are in the company"
- }'
-```
-
-Finally, validate supervisor agent:
-
-```
-curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
- "messages": "How many albums does Iron Maiden have?"
- }'
-```
-
-## Deploy AgentQnA UI
-
-The AgentQnA UI can be deployed locally or using Docker.
-
-For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
## How to register your own tools with agent
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
index 09bde26bde..bbd64ceb30 100644
--- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent
+ with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -35,17 +36,17 @@ services:
image: opea/agent:latest
container_name: sql-agent-endpoint
volumes:
- - ${WORKDIR}/TAG-Bench/:/home/user/TAG-Bench # SQL database
+ - ${WORKDIR}/GenAIExamples/AgentQnA/tests:/home/user/chinook-db # SQL database
ports:
- "9096:9096"
ipc: host
environment:
ip_address: ${ip_address}
strategy: sql_agent
+ with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
- hints_file: /home/user/TAG-Bench/${db_name}_hints.csv
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -64,6 +65,7 @@ services:
container_name: react-agent-endpoint
depends_on:
- worker-rag-agent
+ - worker-sql-agent
volumes:
- ${TOOLSET_PATH}:/home/user/tools/
ports:
@@ -71,14 +73,15 @@ services:
ipc: host
environment:
ip_address: ${ip_address}
- strategy: react_langgraph
+ strategy: react_llama
+ with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
- stream: false
+ stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh
index 7b4e86a781..2455865f27 100644
--- a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh
@@ -16,7 +16,7 @@ export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
-export db_name=california_schools
-export db_path="sqlite:////home/user/TAG-Bench/dev_folder/dev_databases/${db_name}/${db_name}.sqlite"
+export db_name=Chinook
+export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
docker compose -f compose_openai.yaml up -d
diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 4895722c93..c14d58c10b 100644
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent_llama
+ with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -43,6 +44,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: sql_agent_llama
+ with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
@@ -74,6 +76,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: react_llama
+ with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -81,7 +84,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
- stream: false
+ stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}
diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh
index fff5d53f8d..298feee3fd 100644
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh
@@ -14,7 +14,7 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
+export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export NUM_SHARDS=4
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
export temperature=0
diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
similarity index 87%
rename from AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
rename to AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
index 824f7aa855..56f017239b 100644
--- a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
@@ -11,7 +11,7 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-model="meta-llama/Meta-Llama-3.1-70B-Instruct"
+model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export HF_CACHE_DIR=/data2/huggingface
if [ ! -d "$HF_CACHE_DIR" ]; then
@@ -60,23 +60,6 @@ function start_vllm_service_70B() {
echo "Service started successfully"
}
-
-function prepare_data() {
- cd $WORKDIR
-
- echo "Downloading data..."
- git clone https://github.com/TAG-Research/TAG-Bench.git
- cd TAG-Bench/setup
- chmod +x get_dbs.sh
- ./get_dbs.sh
-
- echo "Split data..."
- cd $WORKPATH/tests/sql_agent_test
- bash run_data_split.sh
-
- echo "Data preparation done!"
-}
-
function download_chinook_data(){
echo "Downloading chinook data..."
cd $WORKDIR
@@ -113,7 +96,7 @@ function validate_agent_service() {
echo "======================Testing worker rag agent======================"
export agent_port="9095"
prompt="Tell me about Michael Jackson song Thriller"
- local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
+ local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
# echo $CONTENT
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
echo $EXIT_CODE
@@ -127,7 +110,7 @@ function validate_agent_service() {
echo "======================Testing worker sql agent======================"
export agent_port="9096"
prompt="How many employees are there in the company?"
- local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
+ local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
echo $CONTENT
# echo $EXIT_CODE
@@ -140,9 +123,8 @@ function validate_agent_service() {
# test supervisor react agent
echo "======================Testing supervisor react agent======================"
export agent_port="9090"
- prompt="How many albums does Iron Maiden have?"
- local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
- local EXIT_CODE=$(validate "$CONTENT" "21" "react-agent-endpoint")
+ local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
+ local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
# echo $CONTENT
echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
@@ -153,15 +135,6 @@ function validate_agent_service() {
}
-function remove_data() {
- echo "Removing data..."
- cd $WORKDIR
- if [ -d "TAG-Bench" ]; then
- rm -rf TAG-Bench
- fi
- echo "Data removed!"
-}
-
function remove_chinook_data(){
echo "Removing chinook data..."
cd $WORKDIR
@@ -189,8 +162,9 @@ function main() {
echo "==================== Agent service validated ===================="
}
-remove_data
+
remove_chinook_data
+
main
-remove_data
+
remove_chinook_data
diff --git a/AgentQnA/tests/test.py b/AgentQnA/tests/test.py
index 400684ffd6..18254f16c5 100644
--- a/AgentQnA/tests/test.py
+++ b/AgentQnA/tests/test.py
@@ -1,34 +1,20 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
-import os
+import json
+import uuid
import requests
-def generate_answer_agent_api(url, prompt):
- proxies = {"http": ""}
- payload = {
- "messages": prompt,
- }
- response = requests.post(url, json=payload, proxies=proxies)
- answer = response.json()["text"]
- return answer
-
-
def process_request(url, query, is_stream=False):
proxies = {"http": ""}
-
- payload = {
- "messages": query,
- }
-
+ content = json.dumps(query) if query is not None else None
try:
- resp = requests.post(url=url, json=payload, proxies=proxies, stream=is_stream)
+ resp = requests.post(url=url, data=content, proxies=proxies, stream=is_stream)
if not is_stream:
ret = resp.json()["text"]
- print(ret)
else:
for line in resp.iter_lines(decode_unicode=True):
print(line)
@@ -38,19 +24,54 @@ def process_request(url, query, is_stream=False):
return ret
except requests.exceptions.RequestException as e:
ret = f"An error occurred:{e}"
- print(ret)
- return False
+ return None
+
+
+def test_worker_agent(args):
+ url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
+ query = {"role": "user", "messages": args.prompt, "stream": "false"}
+ ret = process_request(url, query)
+ print("Response: ", ret)
+
+
+def add_message_and_run(url, user_message, thread_id, stream=False):
+ print("User message: ", user_message)
+ query = {"role": "user", "messages": user_message, "thread_id": thread_id, "stream": stream}
+ ret = process_request(url, query, is_stream=stream)
+ print("Response: ", ret)
+
+
+def test_chat_completion_multi_turn(args):
+ url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
+ thread_id = f"{uuid.uuid4()}"
+
+ # first turn
+ print("===============First turn==================")
+ user_message = "Which artist has the most albums in the database?"
+ add_message_and_run(url, user_message, thread_id, stream=args.stream)
+ print("===============End of first turn==================")
+
+ # second turn
+ print("===============Second turn==================")
+ user_message = "Give me a few examples of the artist's albums?"
+ add_message_and_run(url, user_message, thread_id, stream=args.stream)
+ print("===============End of second turn==================")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("--prompt", type=str)
- parser.add_argument("--stream", action="store_true")
- args = parser.parse_args()
+ parser.add_argument("--ip_addr", type=str, default="127.0.0.1", help="endpoint ip address")
+ parser.add_argument("--ext_port", type=str, default="9090", help="endpoint port")
+ parser.add_argument("--stream", action="store_true", help="streaming mode")
+ parser.add_argument("--prompt", type=str, help="prompt message")
+ parser.add_argument("--agent_role", type=str, default="supervisor", help="supervisor or worker")
+ args, _ = parser.parse_known_args()
- ip_address = os.getenv("ip_address", "localhost")
- agent_port = os.getenv("agent_port", "9090")
- url = f"http://{ip_address}:{agent_port}/v1/chat/completions"
- prompt = args.prompt
+ print(args)
- process_request(url, prompt, args.stream)
+ if args.agent_role == "supervisor":
+ test_chat_completion_multi_turn(args)
+ elif args.agent_role == "worker":
+ test_worker_agent(args)
+ else:
+ raise ValueError("Invalid agent role")
diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh
index de70514ba6..ab0ce295cb 100644
--- a/AgentQnA/tests/test_compose_on_gaudi.sh
+++ b/AgentQnA/tests/test_compose_on_gaudi.sh
@@ -78,7 +78,7 @@ bash step3_ingest_data_and_validate_retrieval.sh
echo "=================== #3 Data ingestion and validation completed===================="
echo "=================== #4 Start agent and API server===================="
-bash step4_launch_and_validate_agent_tgi.sh
+bash step4_launch_and_validate_agent_gaudi.sh
echo "=================== #4 Agent test passed ===================="
echo "=================== #5 Stop agent and API server===================="
diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
index d4bfe0446f..9624df7300 100644
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
@@ -13,6 +13,8 @@ services:
dataprep-redis-service:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-redis-server
+ # volumes:
+ # - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/user/comps
depends_on:
- redis-vector-db
ports:
@@ -52,6 +54,8 @@ services:
embedding:
image: ${REGISTRY:-opea}/embedding:${TAG:-latest}
container_name: embedding-server
+ # volumes:
+ # - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/comps
ports:
- "6000:6000"
ipc: host
@@ -110,6 +114,8 @@ services:
reranking:
image: ${REGISTRY:-opea}/reranking:${TAG:-latest}
container_name: reranking-tei-xeon-server
+ # volumes:
+ # - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/user/comps
depends_on:
tei-reranking-service:
condition: service_healthy
diff --git a/DocIndexRetriever/retrieval_tool.py b/DocIndexRetriever/retrieval_tool.py
index b627f45537..99fab7b1b5 100644
--- a/DocIndexRetriever/retrieval_tool.py
+++ b/DocIndexRetriever/retrieval_tool.py
@@ -22,16 +22,38 @@
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
- if self.services[cur_node].service_type == ServiceType.EMBEDDING:
- inputs["input"] = inputs["text"]
- del inputs["text"]
+ print(f"Inputs to {cur_node}: {inputs}")
+ for key, value in kwargs.items():
+ print(f"{key}: {value}")
return inputs
def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_dict, **kwargs):
next_data = {}
if self.services[cur_node].service_type == ServiceType.EMBEDDING:
- next_data = {"text": inputs["input"], "embedding": [item["embedding"] for item in data["data"]]}
+ # turn into chat completion request
+ # next_data = {"text": inputs["input"], "embedding": [item["embedding"] for item in data["data"]]}
+ print("Assembing output from Embedding for next node...")
+ print("Inputs to Embedding: ", inputs)
+ print("Keyword arguments: ")
+ for key, value in kwargs.items():
+ print(f"{key}: {value}")
+
+ next_data = {
+ "input": inputs["input"],
+ "messages": inputs["input"],
+ "embedding": data, # [item["embedding"] for item in data["data"]],
+ "k": kwargs["k"] if "k" in kwargs else 4,
+ "search_type": kwargs["search_type"] if "search_type" in kwargs else "similarity",
+ "distance_threshold": kwargs["distance_threshold"] if "distance_threshold" in kwargs else None,
+ "fetch_k": kwargs["fetch_k"] if "fetch_k" in kwargs else 20,
+ "lambda_mult": kwargs["lambda_mult"] if "lambda_mult" in kwargs else 0.5,
+ "score_threshold": kwargs["score_threshold"] if "score_threshold" in kwargs else 0.2,
+ "top_n": kwargs["top_n"] if "top_n" in kwargs else 1,
+ }
+
+ print("Output from Embedding for next node:\n", next_data)
+
else:
next_data = data
@@ -99,18 +121,6 @@ def parser_input(data, TypeClass, key):
raise ValueError(f"Unknown request type: {data}")
if isinstance(chat_request, ChatCompletionRequest):
- retriever_parameters = RetrieverParms(
- search_type=chat_request.search_type if chat_request.search_type else "similarity",
- k=chat_request.k if chat_request.k else 4,
- distance_threshold=chat_request.distance_threshold if chat_request.distance_threshold else None,
- fetch_k=chat_request.fetch_k if chat_request.fetch_k else 20,
- lambda_mult=chat_request.lambda_mult if chat_request.lambda_mult else 0.5,
- score_threshold=chat_request.score_threshold if chat_request.score_threshold else 0.2,
- )
- reranker_parameters = RerankerParms(
- top_n=chat_request.top_n if chat_request.top_n else 1,
- )
-
initial_inputs = {
"messages": query,
"input": query, # has to be input due to embedding expects either input or text
@@ -123,13 +133,21 @@ def parser_input(data, TypeClass, key):
"top_n": chat_request.top_n if chat_request.top_n else 1,
}
+ kwargs = {
+ "search_type": chat_request.search_type if chat_request.search_type else "similarity",
+ "k": chat_request.k if chat_request.k else 4,
+ "distance_threshold": chat_request.distance_threshold if chat_request.distance_threshold else None,
+ "fetch_k": chat_request.fetch_k if chat_request.fetch_k else 20,
+ "lambda_mult": chat_request.lambda_mult if chat_request.lambda_mult else 0.5,
+ "score_threshold": chat_request.score_threshold if chat_request.score_threshold else 0.2,
+ "top_n": chat_request.top_n if chat_request.top_n else 1,
+ }
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs=initial_inputs,
- retriever_parameters=retriever_parameters,
- reranker_parameters=reranker_parameters,
+ **kwargs,
)
else:
- result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs={"text": query})
+ result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs={"input": query})
last_node = runtime_graph.all_leaves()[-1]
response = result_dict[last_node]
diff --git a/DocIndexRetriever/tests/test.py b/DocIndexRetriever/tests/test.py
new file mode 100644
index 0000000000..ba74827fa6
--- /dev/null
+++ b/DocIndexRetriever/tests/test.py
@@ -0,0 +1,38 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import requests
+
+
+def search_knowledge_base(query: str) -> str:
+ """Search the knowledge base for a specific query."""
+ url = os.environ.get("RETRIEVAL_TOOL_URL")
+ print(url)
+ proxies = {"http": ""}
+ payload = {"messages": query, "k": 5, "top_n": 2}
+ response = requests.post(url, json=payload, proxies=proxies)
+ print(response)
+ if "documents" in response.json():
+ docs = response.json()["documents"]
+ context = ""
+ for i, doc in enumerate(docs):
+ context += f"Doc[{i+1}]:\n{doc}\n"
+ return context
+ elif "text" in response.json():
+ return response.json()["text"]
+ elif "reranked_docs" in response.json():
+ docs = response.json()["reranked_docs"]
+ context = ""
+ for i, doc in enumerate(docs):
+ context += f"Doc[{i+1}]:\n{doc}\n"
+ return context
+ else:
+ return "Error parsing response from the knowledge base."
+
+
+if __name__ == "__main__":
+ resp = search_knowledge_base("What is OPEA?")
+ # resp = search_knowledge_base("Thriller")
+ print(resp)
From 59ffc84c246f1264ee5f63498eadce7e20ca57ed Mon Sep 17 00:00:00 2001
From: Ying Hu
Date: Sun, 23 Feb 2025 17:38:27 +0800
Subject: [PATCH 013/226] Update README.md of AIPC quick start (#1578)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
.../docker_compose/intel/cpu/aipc/README.md | 78 +++++++++++++++++++
1 file changed, 78 insertions(+)
diff --git a/ChatQnA/docker_compose/intel/cpu/aipc/README.md b/ChatQnA/docker_compose/intel/cpu/aipc/README.md
index 5fd253c623..5a217b1f3b 100644
--- a/ChatQnA/docker_compose/intel/cpu/aipc/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/aipc/README.md
@@ -2,6 +2,84 @@
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on AIPC. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
+## Quick Start:
+
+1. Set up the environment variables.
+2. Run Docker Compose.
+3. Consume the ChatQnA Service.
+
+### Quick Start: 1. Set up Environment Variable
+
+To set up environment variables for deploying ChatQnA services, follow these steps:
+
+```bash
+mkdir -p ~/OPEA
+cd ~/OPEA
+git clone https://github.com/opea-project/GenAIExamples.git
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/aipc
+```
+
+1. Set the required environment variables:
+
+ ```bash
+ export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
+ ```
+
+2. If you are in a proxy environment, also set the proxy-related environment variables:
+
+ ```bash
+ export https_proxy="Your_HTTPs_Proxy"
+ # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+ export no_proxy=$no_proxy,chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
+ ```
+
+3. Set up other environment variables
+
+ By default, llama3.2 is used for LLM serving, the default model can be changed to other LLM models. Please pick a [validated llm models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
+ To change the default model defined in set_env.sh, overwrite it by exporting OLLAMA_MODEL to the new model or by modifying set_env.sh.
+ For example, change to using the following model.
+
+ ```bash
+ export OLLAMA_MODEL="deepseek-r1:8b"
+ ```
+
+ to use the [DeepSeek-R1-Distill-Llama-8B model](https://ollama.com/library/deepseek-r1:8b)
+
+ ```bash
+ source ./set_env.sh
+ ```
+
+### Quick Start: 2. Run Docker Compose
+
+```bash
+ docker compose up -d
+```
+
+It will take several minutes to automatically download the Docker images.
+
+NB: You should build the Docker image from source yourself if:
+
+- You are developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
+- You can't download the docker image.
+- You want to use a specific version of Docker image.
+
+Please refer to ['Build Docker Images'](#🚀-build-docker-images) below.
+
+### Quick Start: 3. Consume the ChatQnA Service
+
+Once the services are up, open the following URL from your browser: http://{host_ip}:80.
+Enter a prompt such as "What is deep learning?".
+
+Or if you prefer to try only on the localhost machine, then try
+
+```bash
+curl http://${host_ip}:8888/v1/chatqna \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": "What is deep learning?"
+ }'
+```
+
## 🚀 Build Docker Images
First of all, you need to build Docker Images locally and install the python package of it.
From 4bd9c1a256ef86a96f19c7a96d892165930ba9e2 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Tue, 25 Feb 2025 06:45:21 +0200
Subject: [PATCH 014/226] Fix "OpenAI" & "response" spelling (#1561)
Signed-off-by: Chingis Yundunov
---
AgentQnA/docker_compose/intel/cpu/xeon/README.md | 2 +-
ChatQnA/chatqna.py | 2 +-
GraphRAG/graphrag.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/README.md b/AgentQnA/docker_compose/intel/cpu/xeon/README.md
index dde535f2ae..a2abfc7ce9 100644
--- a/AgentQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/README.md
@@ -60,7 +60,7 @@ This example showcases a hierarchical multi-agent system for question-answering
```
6. Launch multi-agent system
- The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use openAI GPT-4o-mini as LLM.
+ The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use OpenAI GPT-4o-mini as LLM.
```
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index 104c6fdb13..afb9706cb2 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -167,7 +167,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
def align_generator(self, gen, **kwargs):
- # openai reaponse format
+ # OpenAI response format
# b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
for line in gen:
line = line.decode("utf-8")
diff --git a/GraphRAG/graphrag.py b/GraphRAG/graphrag.py
index 4eafaab244..6433e410ad 100644
--- a/GraphRAG/graphrag.py
+++ b/GraphRAG/graphrag.py
@@ -110,7 +110,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
def align_generator(self, gen, **kwargs):
- # openai reaponse format
+ # OpenAI response format
# b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
print("generator in align generator:\n", gen)
for line in gen:
From 2abf73842ab5e983ccb265c028612553b85cabc3 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 25 Feb 2025 14:32:03 +0800
Subject: [PATCH 015/226] Bump gradio from 5.5.0 to 5.11.0 in /DocSum/ui/gradio
(#1576)
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Liang Lv
Signed-off-by: Chingis Yundunov
---
DocSum/ui/gradio/requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/DocSum/ui/gradio/requirements.txt b/DocSum/ui/gradio/requirements.txt
index 9086603d04..5824f07218 100644
--- a/DocSum/ui/gradio/requirements.txt
+++ b/DocSum/ui/gradio/requirements.txt
@@ -1,5 +1,5 @@
docx2txt
-gradio==5.5.0
+gradio==5.11.0
langchain_community
moviepy==1.0.3
numpy==1.26.4
From 8e8d296965f3aac88ea948d93e2bc4b3d4e51089 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 016/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV WILM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing a article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ WILM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+ - docsum-tgi-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 9aba6d05c9b1a6f1bd7f332167171d79373e39ea Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 017/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV WILM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing a article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From 24f886f4057c9739c4bef3d655a159608420d8cd Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 018/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV WILM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing a article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ WILM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+ - docsum-tgi-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 2e1b401ad5edb84b769ef8a9ac52062b2213c720 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 019/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV WILM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing a article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From c9a78079957c580116093d234e1ee481ec196951 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 020/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV WILM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing a article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ WILM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+ - docsum-tgi-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From b2e1523b4b8975582f40594feeb476cf192efe75 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 021/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV WILM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing a article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From 947aa8129a2ed40e7873f926d74c341a765f6ce6 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 022/226] DocSum - add files for deploy app with ROCm vLLM
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 ++
.../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++
.../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++
.../amd/gpu/rocm-vllm/set_env.sh | 16 ++
DocSum/docker_image_build/build.yaml | 9 +
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++
6 files changed, 574 insertions(+)
create mode 100644 DocSum/Dockerfile-vllm-rocm
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV WILM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy DocSum Application on AMD GPU (ROCm)
+
+## Build images
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following four Docker Images:
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
+### 4. Build React UI Docker Image
+
+Build the frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
+
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+1. `opea/llm-docsum-tgi:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
+For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+
+```bash
+export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${host_ip}
+export DOCSUM_TGI_SERVICE_PORT="18882"
+export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="8008"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
+```
+
+Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+
+Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example for set isolation for 1 GPU
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example for set isolation for 2 GPUs
+
+```
+ - /dev/dri/card0:/dev/dri/card0
+ - /dev/dri/renderD128:/dev/dri/renderD128
+ - /dev/dri/card1:/dev/dri/card1
+ - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
+docker compose up -d
+```
+
+### Validate Microservices
+
+1. TGI Service
+
+ ```bash
+ curl http://${host_ip}:8008/generate \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM Microservice
+
+ ```bash
+ curl http://${host_ip}:9000/v1/docsum \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. MegaService
+
+ ```bash
+ curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+ "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+ }'
+ ```
+
+## 🚀 Launch the Svelte UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+
+
+Here is an example for summarizing a article.
+
+
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+ image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ container_name: docsum-rocm-react-ui-server
+ depends_on:
+ - docsum-rocm-backend-server
+ ports:
+ - "5174:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
+
+
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ docsum-vllm-service:
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+ container_name: docsum-vllm-service
+ ports:
+ - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ WILM_USE_TRITON_FLASH_ATTENTION: 0
+ PYTORCH_JIT: 0
+ volumes:
+ - "./data:/data"
+ shm_size: 20G
+ devices:
+ - /dev/kfd:/dev/kfd
+ - /dev/dri/:/dev/dri/
+ cap_add:
+ - SYS_PTRACE
+ group_add:
+ - video
+ security_opt:
+ - seccomp:unconfined
+ - apparmor=unconfined
+ command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+ ipc: host
+
+ docsum-llm-server:
+ image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ container_name: docsum-llm-server
+ depends_on:
+ - docsum-vllm-service
+ ports:
+ - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+ ipc: host
+ cap_add:
+ - SYS_PTRACE
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+ HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+ LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+ MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+ MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+ restart: unless-stopped
+
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ docsum-backend-server:
+ image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ container_name: docsum-backend-server
+ depends_on:
+ - docsum-tgi-service
+ - docsum-llm-server
+ ports:
+ - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ MEGA_SERVICE_HOST_IP: ${HOST_IP}
+ LLM_SERVICE_HOST_IP: ${HOST_IP}
+ ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+
+ docsum-gradio-ui:
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+ container_name: docsum-ui-server
+ depends_on:
+ - docsum-backend-server
+ ports:
+ - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+ environment:
+ no_proxy: ${no_proxy}
+ https_proxy: ${https_proxy}
+ http_proxy: ${http_proxy}
+ BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+export HOST_IP=""
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 095fd28c93..dc0d546189 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,3 +47,12 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+ vllm_rocm:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ no_proxy: ${no_proxy}
+ context: ../
+ dockerfile: ./Dockerfile-vllm-rocm
+ image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
new file mode 100644
index 0000000000..d0919a019a
--- /dev/null
+++ b/DocSum/tests/test_compose_on_rocm_vllm.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export HOST_IP=${ip_address}
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export MEGA_SERVICE_HOST_IP=${HOST_IP}
+export LLM_SERVICE_HOST_IP=${HOST_IP}
+export ASR_SERVICE_HOST_IP=${HOST_IP}
+export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
+ sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
+ # Start Docker Containers
+ docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$WORKPATH/tests/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$WORKPATH/tests/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # vLLM service
+ validate_services \
+ "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
+ "generated_text" \
+ "docsum-vllm-service" \
+ "docsum-vllm-service" \
+ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+
+ # llm microservice
+ validate_services \
+ "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
+ "text" \
+ "docsum-llm-server" \
+ "docsum-llm-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="docsum-backend-server"
+ local DOCKER_NAME="docsum-backend-server"
+ local EXPECTED_RESULT="[DONE]"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ echo ">>> Checking audio data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
+
+ echo ">>> Checking video data"
+ validate_services \
+ "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
+ "[DONE]" \
+ "docsum-backend-server" \
+ "docsum-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
+
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
+ docker compose stop && docker compose rm -f
+}
+
+function main() {
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
+ start_services
+
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
+ validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
+ validate_megaservice
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
+ stop_docker
+
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
+}
+
+main
From 6b2b29703e30add06fac3e955f4410b86ed35a69 Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:07:05 +0700
Subject: [PATCH 023/226] DocSum - fix main
Signed-off-by: Chingis Yundunov
---
DocSum/Dockerfile-vllm-rocm | 18 --
.../amd/gpu/rocm-vllm/README.md | 175 ------------
.../amd/gpu/rocm-vllm/compose.yaml | 107 --------
.../amd/gpu/rocm-vllm/set_env.sh | 16 --
DocSum/docker_image_build/build.yaml | 9 -
DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------
6 files changed, 574 deletions(-)
delete mode 100644 DocSum/Dockerfile-vllm-rocm
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh
diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
deleted file mode 100644
index f0e8a8743a..0000000000
--- a/DocSum/Dockerfile-vllm-rocm
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM rocm/vllm-dev:main
-
-# Set the working directory
-WORKDIR /workspace
-
-# Copy the api_server.py into the image
-ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
-
-# Expose the port used by the API server
-EXPOSE 8011
-
-# Set environment variables
-ENV HUGGINGFACE_HUB_CACHE=/workspace
-ENV WILM_USE_TRITON_FLASH_ATTENTION=0
-ENV PYTORCH_JIT=0
-
-# Set the entrypoint to the api_server.py script
-ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
deleted file mode 100644
index 4d41a5cd31..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Build and deploy DocSum Application on AMD GPU (ROCm)
-
-## Build images
-
-## 🚀 Build Docker Images
-
-First of all, you need to build Docker Images locally and install the python package of it.
-
-### 1. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following four Docker Images:
-
-### 2. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-
-### 4. Build React UI Docker Image
-
-Build the frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/DocSum/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-
-docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
-## 🚀 Start Microservices and MegaService
-
-### Required Models
-
-Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model.
-For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${host_ip}
-export DOCSUM_TGI_SERVICE_PORT="18882"
-export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export DOCSUM_LLM_SERVER_PORT="8008"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
-```
-
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
-
-Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-Example for set isolation for 1 GPU
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example for set isolation for 2 GPUs
-
-```
- - /dev/dri/card0:/dev/dri/card0
- - /dev/dri/renderD128:/dev/dri/renderD128
- - /dev/dri/card1:/dev/dri/card1
- - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. TGI Service
-
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
-
-2. LLM Microservice
-
- ```bash
- curl http://${host_ip}:9000/v1/docsum \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
-
-3. MegaService
-
- ```bash
- curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
- "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
- }'
- ```
-
-## 🚀 Launch the Svelte UI
-
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
-
-
-
-Here is an example for summarizing a article.
-
-
-
-## 🚀 Launch the React UI (Optional)
-
-To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below:
-
-```yaml
-docsum-rocm-react-ui-server:
- image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
- container_name: docsum-rocm-react-ui-server
- depends_on:
- - docsum-rocm-backend-server
- ports:
- - "5174:80"
- environment:
- - no_proxy=${no_proxy}
- - https_proxy=${https_proxy}
- - http_proxy=${http_proxy}
- - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-```
-
-Open this URL `http://{host_ip}:5175` in your browser to access the frontend.
-
-
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
deleted file mode 100644
index 037aa06395..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
- docsum-vllm-service:
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
- container_name: docsum-vllm-service
- ports:
- - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
- WILM_USE_TRITON_FLASH_ATTENTION: 0
- PYTORCH_JIT: 0
- volumes:
- - "./data:/data"
- shm_size: 20G
- devices:
- - /dev/kfd:/dev/kfd
- - /dev/dri/:/dev/dri/
- cap_add:
- - SYS_PTRACE
- group_add:
- - video
- security_opt:
- - seccomp:unconfined
- - apparmor=unconfined
- command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
- ipc: host
-
- docsum-llm-server:
- image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- container_name: docsum-llm-server
- depends_on:
- - docsum-vllm-service
- ports:
- - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
- ipc: host
- cap_add:
- - SYS_PTRACE
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
- HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
- LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
- LOGFLAG: ${DOCSUM_LOGFLAG:-False}
- MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
- MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
- restart: unless-stopped
-
- whisper-service:
- image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- container_name: whisper-service
- ports:
- - "${DOCSUM_WHISPER_PORT:-7066}:7066"
- ipc: host
- environment:
- no_proxy: ${no_proxy}
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- restart: unless-stopped
-
- docsum-backend-server:
- image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
- container_name: docsum-backend-server
- depends_on:
- - docsum-tgi-service
- - docsum-llm-server
- ports:
- - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- MEGA_SERVICE_HOST_IP: ${HOST_IP}
- LLM_SERVICE_HOST_IP: ${HOST_IP}
- ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
- ipc: host
- restart: always
-
- docsum-gradio-ui:
- image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
- container_name: docsum-ui-server
- depends_on:
- - docsum-backend-server
- ports:
- - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
- environment:
- no_proxy: ${no_proxy}
- https_proxy: ${https_proxy}
- http_proxy: ${http_proxy}
- BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
- ipc: host
- restart: always
-
-networks:
- default:
- driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
deleted file mode 100644
index 43e71e0fbf..0000000000
--- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-export HOST_IP=""
-export DOCSUM_MAX_INPUT_TOKENS=2048
-export DOCSUM_MAX_TOTAL_TOKENS=4096
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=""
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index dc0d546189..095fd28c93 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -47,12 +47,3 @@ services:
dockerfile: comps/llms/src/doc-summarization/Dockerfile
extends: docsum
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
- vllm_rocm:
- build:
- args:
- http_proxy: ${http_proxy}
- https_proxy: ${https_proxy}
- no_proxy: ${no_proxy}
- context: ../
- dockerfile: ./Dockerfile-vllm-rocm
- image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh
deleted file mode 100644
index d0919a019a..0000000000
--- a/DocSum/tests/test_compose_on_rocm_vllm.sh
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export MAX_INPUT_TOKENS=1024
-export MAX_TOTAL_TOKENS=2048
-export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export HOST_IP=${ip_address}
-export DOCSUM_VLLM_SERVICE_PORT="8008"
-export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export DOCSUM_LLM_SERVER_PORT="9000"
-export DOCSUM_WHISPER_PORT="7066"
-export DOCSUM_BACKEND_SERVER_PORT="8888"
-export DOCSUM_FRONTEND_PORT="5173"
-export MEGA_SERVICE_HOST_IP=${HOST_IP}
-export LLM_SERVICE_HOST_IP=${HOST_IP}
-export ASR_SERVICE_HOST_IP=${HOST_IP}
-export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-
-function build_docker_images() {
- opea_branch=${opea_branch:-"main"}
- # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
- if [[ "${opea_branch}" != "main" ]]; then
- cd $WORKPATH
- OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
- NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
- find . -type f -name "Dockerfile*" | while read -r file; do
- echo "Processing file: $file"
- sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
- done
- fi
-
- cd $WORKPATH/docker_image_build
- git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
- echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper"
- docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
- docker images && sleep 1s
-}
-
-function start_services() {
- cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm
- sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env
- # Start Docker Containers
- docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log
- sleep 1m
-}
-
-function validate_services() {
- local URL="$1"
- local EXPECTED_RESULT="$2"
- local SERVICE_NAME="$3"
- local DOCKER_NAME="$4"
- local INPUT_DATA="$5"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-
- echo "==========================================="
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
- echo "CONTENT==> $CONTENT"
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
-
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-get_base64_str() {
- local file_name=$1
- base64 -w 0 "$file_name"
-}
-
-# Function to generate input data for testing based on the document type
-input_data_for_test() {
- local document_type=$1
- case $document_type in
- ("text")
- echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
- ;;
- ("audio")
- get_base64_str "$WORKPATH/tests/data/test.wav"
- ;;
- ("video")
- get_base64_str "$WORKPATH/tests/data/test.mp4"
- ;;
- (*)
- echo "Invalid document type" >&2
- exit 1
- ;;
- esac
-}
-
-function validate_microservices() {
- # Check if the microservices are running correctly.
-
- # whisper microservice
- ulimit -s 65536
- validate_services \
- "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \
- '{"asr_result":"well"}' \
- "whisper-service" \
- "whisper-service" \
- "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
- # vLLM service
- validate_services \
- "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \
- "generated_text" \
- "docsum-vllm-service" \
- "docsum-vllm-service" \
- '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-
- # llm microservice
- validate_services \
- "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \
- "text" \
- "docsum-llm-server" \
- "docsum-llm-server" \
- '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
-}
-
-function validate_megaservice() {
- local SERVICE_NAME="docsum-backend-server"
- local DOCKER_NAME="docsum-backend-server"
- local EXPECTED_RESULT="[DONE]"
- local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${host_ip}:8888/v1/docsum"
- local DATA_TYPE="type=text"
-
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
-
- if [ "$HTTP_STATUS" -eq 200 ]; then
- echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
- local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
- if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
- echo "[ $SERVICE_NAME ] Content is as expected."
- else
- echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- else
- echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
- exit 1
- fi
- sleep 1s
-}
-
-function validate_megaservice_json() {
- # Curl the Mega Service
- echo ""
- echo ">>> Checking text data with Content-Type: application/json"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
-
- echo ">>> Checking audio data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
-
- echo ">>> Checking video data"
- validate_services \
- "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \
- "[DONE]" \
- "docsum-backend-server" \
- "docsum-backend-server" \
- "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
-
-}
-
-function stop_docker() {
- cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/
- docker compose stop && docker compose rm -f
-}
-
-function main() {
- echo "==========================================="
- echo ">>>> Stopping any running Docker containers..."
- stop_docker
-
- echo "==========================================="
- if [[ "$IMAGE_REPO" == "opea" ]]; then
- echo ">>>> Building Docker images..."
- build_docker_images
- fi
-
- echo "==========================================="
- echo ">>>> Starting Docker services..."
- start_services
-
- echo "==========================================="
- echo ">>>> Validating microservices..."
- validate_microservices
-
- echo "==========================================="
- echo ">>>> Validating megaservice..."
- validate_megaservice
- echo ">>>> Validating validate_megaservice_json..."
- validate_megaservice_json
-
- echo "==========================================="
- echo ">>>> Stopping Docker containers..."
- stop_docker
-
- echo "==========================================="
- echo ">>>> Pruning Docker system..."
- echo y | docker system prune
- echo ">>>> Docker system pruned successfully."
- echo "==========================================="
-}
-
-main
From f7b3be60ecf572b50a4c0bf3a5eee70513bee904 Mon Sep 17 00:00:00 2001
From: Spycsh <39623753+Spycsh@users.noreply.github.com>
Date: Thu, 27 Feb 2025 09:25:49 +0800
Subject: [PATCH 024/226] Align mongo related image names with comps (#1543)
- chathistory-mongo-server -> chathistory-mongo (except container names)
- feedbackmanagement -> feedbackmanagement-mongo
- promptregistry-server/promptregistry-mongo-server -> promptregistry-mongo (except container names)
Signed-off-by: Spycsh
Signed-off-by: Chingis Yundunov
---
ProductivitySuite/docker_compose/intel/cpu/xeon/README.md | 4 ++--
.../docker_compose/intel/cpu/xeon/compose.yaml | 4 ++--
ProductivitySuite/docker_image_build/build.yaml | 8 ++++----
docker_images_list.md | 2 +-
4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
index 5ab4816096..ce7a874b38 100644
--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
@@ -45,13 +45,13 @@ docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_p
### 6. Build Prompt Registry Image
```bash
-docker build -t opea/promptregistry-mongo-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/src/Dockerfile .
+docker build -t opea/promptregistry-mongo:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/src/Dockerfile .
```
### 7. Build Chat History Image
```bash
-docker build -t opea/chathistory-mongo-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/src/Dockerfile .
+docker build -t opea/chathistory-mongo:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/src/Dockerfile .
cd ..
```
diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
index ee7d23a640..149109e4b7 100644
--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
@@ -310,7 +310,7 @@ services:
command: mongod --quiet --logpath /dev/null
chathistory-mongo:
- image: ${REGISTRY:-opea}/chathistory-mongo-server:${TAG:-latest}
+ image: ${REGISTRY:-opea}/chathistory-mongo:${TAG:-latest}
container_name: chathistory-mongo-server
ports:
- "6012:6012"
@@ -326,7 +326,7 @@ services:
restart: unless-stopped
promptregistry-mongo:
- image: ${REGISTRY:-opea}/promptregistry-mongo-server:${TAG:-latest}
+ image: ${REGISTRY:-opea}/promptregistry-mongo:${TAG:-latest}
container_name: promptregistry-mongo-server
ports:
- "6018:6018"
diff --git a/ProductivitySuite/docker_image_build/build.yaml b/ProductivitySuite/docker_image_build/build.yaml
index 807aa1242c..9bfc65e362 100644
--- a/ProductivitySuite/docker_image_build/build.yaml
+++ b/ProductivitySuite/docker_image_build/build.yaml
@@ -41,18 +41,18 @@ services:
dockerfile: comps/dataprep/src/Dockerfile
extends: chatqna
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
- promptregistry-mongo-server:
+ promptregistry-mongo:
build:
context: GenAIComps
dockerfile: comps/prompt_registry/src/Dockerfile
extends: chatqna
- image: ${REGISTRY:-opea}/promptregistry-mongo-server:${TAG:-latest}
- chathistory-mongo-server:
+ image: ${REGISTRY:-opea}/promptregistry-mongo:${TAG:-latest}
+ chathistory-mongo:
build:
context: GenAIComps
dockerfile: comps/chathistory/src/Dockerfile
extends: chatqna
- image: ${REGISTRY:-opea}/chathistory-mongo-server:${TAG:-latest}
+ image: ${REGISTRY:-opea}/chathistory-mongo:${TAG:-latest}
productivity-suite-react-ui-server:
build:
context: ../ui
diff --git a/docker_images_list.md b/docker_images_list.md
index 242aad57c0..ab6349fd97 100644
--- a/docker_images_list.md
+++ b/docker_images_list.md
@@ -56,7 +56,7 @@ Take ChatQnA for example. ChatQnA is a chatbot application service based on the
| [opea/agent-ui](https://hub.docker.com/r/opea/agent-ui) | [Link](https://github.com/opea-project/GenAIExamples/blob/main/AgentQnA/ui/docker/Dockerfile) | The docker image exposed the OPEA agent microservice UI entry for GenAI application use |
| [opea/asr](https://hub.docker.com/r/opea/asr) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/asr/src/Dockerfile) | The docker image exposed the OPEA Audio-Speech-Recognition microservice for GenAI application use |
| [opea/animation](https://hub.docker.com/r/opea/animation) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/animation/src/Dockerfile) | The purpose of the Docker image is to expose the OPEA Avatar Animation microservice for GenAI application use. |
-| [opea/chathistory-mongo-server](https://hub.docker.com/r/opea/chathistory-mongo-server) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/chathistory/src/Dockerfile) | The docker image exposes OPEA Chat History microservice which based on MongoDB database, designed to allow user to store, retrieve and manage chat conversations |
+| [opea/chathistory-mongo](https://hub.docker.com/r/opea/chathistory-mongo) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/chathistory/src/Dockerfile) | The docker image exposes OPEA Chat History microservice which based on MongoDB database, designed to allow user to store, retrieve and manage chat conversations |
| [opea/dataprep](https://hub.docker.com/r/opea/dataprep) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/dataprep/src/Dockerfile) | The docker image exposed the OPEA dataprep microservice for GenAI application use |
| [opea/embedding](https://hub.docker.com/r/opea/embedding) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/embeddings/src/Dockerfile) | The docker image exposed the OPEA mosec embedding microservice for GenAI application use |
| [opea/embedding-multimodal-clip](https://hub.docker.com/r/opea/embedding-multimodal-clip) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/third_parties/clip/src/Dockerfile) | The docker image exposed the OPEA mosec embedding microservice base on Langchain framework for GenAI application use |
From 23fbd2fb44ddf511fdbbcdfea828d975b9ed3eea Mon Sep 17 00:00:00 2001
From: Artem Astafev
Date: Thu, 27 Feb 2025 14:26:45 +0700
Subject: [PATCH 025/226] Fix ChatQnA ROCm compose Readme file and absolute
path for ROCM CI test (#1159)
Signed-off-by: Artem Astafev
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/amd/gpu/rocm/README.md | 2 +-
ChatQnA/tests/test_compose_on_rocm.sh | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/README.md b/ChatQnA/docker_compose/amd/gpu/rocm/README.md
index cfd9245541..1bb82838c0 100644
--- a/ChatQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/README.md
@@ -1,4 +1,4 @@
-# Build and deploy CodeGen Application on AMD GPU (ROCm)
+# Build and deploy ChatQnA Application on AMD GPU (ROCm)
## Build MegaService of ChatQnA on AMD ROCm GPU
diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh
index 9a25392997..d6dc5dfae1 100644
--- a/ChatQnA/tests/test_compose_on_rocm.sh
+++ b/ChatQnA/tests/test_compose_on_rocm.sh
@@ -45,7 +45,7 @@ export CHATQNA_RERANK_SERVICE_HOST_IP=${HOST_IP}
export CHATQNA_LLM_SERVICE_HOST_IP=${HOST_IP}
export CHATQNA_NGINX_PORT=80
export CHATQNA_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export PATH="/home/huggingface/miniconda3/bin:$PATH"
+export PATH="~/miniconda3/bin:$PATH"
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
From 4b47c3ec25543246a9ae015271927b5a05950273 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Thu, 27 Feb 2025 23:32:29 +0800
Subject: [PATCH 026/226] Fix async in chatqna bug (#1589)
Align async with comps: related PR: opea-project/GenAIComps#1300
Signed-off-by: Xinyao Wang
Signed-off-by: Chingis Yundunov
---
ChatQnA/chatqna.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index afb9706cb2..e25ab4d39a 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -166,10 +166,10 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
return next_data
-def align_generator(self, gen, **kwargs):
+async def align_generator(self, gen, **kwargs):
# OpenAI response format
# b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
- for line in gen:
+ async for line in gen:
line = line.decode("utf-8")
start = line.find("{")
end = line.rfind("}") + 1
From ed01594f26a9a75f149df6f5e2c7498beff1dbe1 Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Fri, 28 Feb 2025 10:30:54 +0800
Subject: [PATCH 027/226] Fix benchmark scripts (#1517)
- Align benchmark default config:
1. Update default helm charts version.
2. Add `# mandatory` comment.
3. Update default model ID for LLM.
- Fix deploy issue:
1. Support different `replicaCount` for w/ w/o rerank test.
2. Add `max_num_seqs` for vllm.
3. Add resource setting for tune mode.
- Fix Benchmark issue:
1. Update `user_queries` and `concurrency` setting.
2. Remove invalid parameters.
3. Fix `dataset` and `prompt` setting. And dataset ingest into db.
5. Fix the benchmark hang issue with large user queries. Update `"processes": 16` will fix this issue.
6. Update the eval_path setting logical.
- Optimize benchmark readme.
- Optimize the log path to make the logs more readable.
Signed-off-by: chensuyue
Signed-off-by: Cathy Zhang
Signed-off-by: letonghan
Signed-off-by: Chingis Yundunov
---
ChatQnA/benchmark_chatqna.yaml | 101 +++++++++-----
README-deploy-benchmark.md | 143 ++++++++++++++++++--
benchmark.py | 233 ++++++++++++++++++++++++---------
deploy.py | 150 +++++++++++++--------
deploy_and_benchmark.py | 220 ++++++++++++++++++++++++-------
requirements.txt | 1 +
6 files changed, 641 insertions(+), 207 deletions(-)
diff --git a/ChatQnA/benchmark_chatqna.yaml b/ChatQnA/benchmark_chatqna.yaml
index c608b8afbf..407d555ceb 100644
--- a/ChatQnA/benchmark_chatqna.yaml
+++ b/ChatQnA/benchmark_chatqna.yaml
@@ -3,55 +3,89 @@
deploy:
device: gaudi
- version: 1.1.0
+ version: 1.2.0
modelUseHostPath: /mnt/models
- HUGGINGFACEHUB_API_TOKEN: ""
+ HUGGINGFACEHUB_API_TOKEN: "" # mandatory
node: [1, 2, 4, 8]
namespace: ""
+ timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
+ interval: 5 # interval in seconds between service ready checks, default 5 seconds
services:
backend:
- instance_num: [2, 2, 4, 8]
- cores_per_instance: ""
- memory_capacity: ""
+ resources:
+ enabled: False
+ cores_per_instance: "16"
+ memory_capacity: "8000Mi"
+ replicaCount: [1, 2, 4, 8]
teirerank:
enabled: True
model_id: ""
+ resources:
+ enabled: False
+ cards_per_instance: 1
replicaCount: [1, 1, 1, 1]
- cards_per_instance: 1
tei:
model_id: ""
+ resources:
+ enabled: False
+ cores_per_instance: "80"
+ memory_capacity: "20000Mi"
replicaCount: [1, 2, 4, 8]
- cores_per_instance: ""
- memory_capacity: ""
llm:
- engine: tgi
- model_id: ""
- replicaCount: [7, 15, 31, 63]
- max_batch_size: [1, 2, 4, 8]
- max_input_length: ""
- max_total_tokens: ""
- max_batch_total_tokens: ""
- max_batch_prefill_tokens: ""
- cards_per_instance: 1
+ engine: vllm # or tgi
+ model_id: "meta-llama/Meta-Llama-3-8B-Instruct" # mandatory
+ replicaCount:
+ with_teirerank: [7, 15, 31, 63] # When teirerank.enabled is True
+ without_teirerank: [8, 16, 32, 64] # When teirerank.enabled is False
+ resources:
+ enabled: False
+ cards_per_instance: 1
+ model_params:
+ vllm: # VLLM specific parameters
+ batch_params:
+ enabled: True
+ max_num_seqs: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
+ token_params:
+ enabled: False
+ max_input_length: ""
+ max_total_tokens: ""
+ max_batch_total_tokens: ""
+ max_batch_prefill_tokens: ""
+ tgi: # TGI specific parameters
+ batch_params:
+ enabled: True
+ max_batch_size: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
+ token_params:
+ enabled: False
+ max_input_length: "1280"
+ max_total_tokens: "2048"
+ max_batch_total_tokens: "65536"
+ max_batch_prefill_tokens: "4096"
data-prep:
+ resources:
+ enabled: False
+ cores_per_instance: ""
+ memory_capacity: ""
replicaCount: [1, 1, 1, 1]
- cores_per_instance: ""
- memory_capacity: ""
retriever-usvc:
- replicaCount: [2, 2, 4, 8]
- cores_per_instance: ""
- memory_capacity: ""
+ resources:
+ enabled: False
+ cores_per_instance: "8"
+ memory_capacity: "8000Mi"
+ replicaCount: [1, 2, 4, 8]
redis-vector-db:
+ resources:
+ enabled: False
+ cores_per_instance: ""
+ memory_capacity: ""
replicaCount: [1, 1, 1, 1]
- cores_per_instance: ""
- memory_capacity: ""
chatqna-ui:
replicaCount: [1, 1, 1, 1]
@@ -61,22 +95,17 @@ deploy:
benchmark:
# http request behavior related fields
- concurrency: [1, 2, 4]
- totoal_query_num: [2048, 4096]
- duration: [5, 10] # unit minutes
- query_num_per_concurrency: [4, 8, 16]
- possion: True
- possion_arrival_rate: 1.0
+ user_queries: [640]
+ concurrency: [128]
+ load_shape_type: "constant" # "constant" or "poisson"
+ poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
warmup_iterations: 10
seed: 1024
# workload, all of the test cases will run for benchmark
- test_cases:
- - chatqnafixed
- - chatqna_qlist_pubmed:
- dataset: pub_med10 # pub_med10, pub_med100, pub_med1000
- user_queries: [1, 2, 4]
- query_token_size: 128 # if specified, means fixed query token size will be sent out
+ bench_target: [chatqnafixed, chatqna_qlist_pubmed] # specify the bench_target for benchmark
+ dataset: ["/home/sdp/upload_file.txt", "/home/sdp/pubmed_10000.txt"] # specify the absolute path to the dataset file
+ prompt: [10, 1000] # set the prompt length for the chatqna_qlist_pubmed workload, set to 10 for chatqnafixed workload
llm:
# specify the llm output token size
diff --git a/README-deploy-benchmark.md b/README-deploy-benchmark.md
index 4b813cccca..9f1a13f8ff 100644
--- a/README-deploy-benchmark.md
+++ b/README-deploy-benchmark.md
@@ -11,10 +11,9 @@ We aim to run these benchmarks and share them with the OPEA community for three
## Table of Contents
- [Prerequisites](#prerequisites)
-- [Overview](#overview)
- - [Using deploy_and_benchmark.py](#using-deploy_and_benchmark.py-recommended)
- [Data Preparation](#data-preparation)
-- [Configuration](#configuration)
+- [Running Deploy and Benchmark Tests](#running-deploy-and-benchmark-tests)
+- [Troubleshooting](#troubleshooting)
## Prerequisites
@@ -25,8 +24,50 @@ Before running the benchmarks, ensure you have:
- Kubernetes installation: Use [kubespray](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md) or other official Kubernetes installation guides
- (Optional) [Kubernetes set up guide on Intel Gaudi product](https://github.com/opea-project/GenAIInfra/blob/main/README.md#setup-kubernetes-cluster)
-2. **Configuration YAML**
- The configuration file (e.g., `./ChatQnA/benchmark_chatqna.yaml`) consists of two main sections: deployment and benchmarking. Required fields must be filled with valid values (like the Hugging Face token). For all other fields, you can either customize them according to your needs or leave them empty ("") to use the default values from the [helm charts](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts).
+2. **Configuration YAML**
+   The configuration file (e.g., `./ChatQnA/benchmark_chatqna.yaml`) consists of two main sections: deployment and benchmarking. Required fields with `# mandatory` comment must be filled with valid values, such as `HUGGINGFACEHUB_API_TOKEN`. For all other fields, you can either customize them according to your needs or leave them empty ("") to use the default values from the [helm charts](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts).
+
+ **Default Models**:
+
+ - LLM: `meta-llama/Meta-Llama-3-8B-Instruct` (Required: must be specified as it's shared between deployment and benchmarking phases)
+ - Embedding: `BAAI/bge-base-en-v1.5`
+ - Reranking: `BAAI/bge-reranker-base`
+
+ You can customize which models to use by setting the `model_id` field in the corresponding service section. Note that the LLM model must be specified in the configuration as it is used by both deployment and benchmarking processes.
+
+ **Important Notes**:
+
+ - For Gaudi deployments:
+ - LLM service runs on Gaudi devices
+ - If enabled, the reranking service (teirerank) also runs on Gaudi devices
+ - **Llama Model Access**:
+ - Downloading Llama models requires both:
+ 1. HuggingFace API token
+ 2. Special authorization from Meta
+ - Please visit [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to request access
+ - Deployment will fail if model download is unsuccessful due to missing authorization
+
+ **Node and Replica Configuration**:
+
+ ```yaml
+ node: [1, 2, 4, 8] # Number of nodes to deploy
+ replicaCount: [1, 2, 4, 8] # Must align with node configuration
+ ```
+
+ The `replicaCount` values must align with the `node` configuration by index:
+
+ - When deploying on 1 node → uses replicaCount[0] = 1
+ - When deploying on 2 nodes → uses replicaCount[1] = 2
+ - When deploying on 4 nodes → uses replicaCount[2] = 4
+ - When deploying on 8 nodes → uses replicaCount[3] = 8
+
+ Note: Model parameters that accept lists (e.g., `max_batch_size`, `max_num_seqs`) are deployment parameters that affect model service behavior but not the number of service instances. When these parameters are lists, each value will trigger a service upgrade followed by a new round of testing, while maintaining the same number of service instances.
+
+3. **Install required Python packages**
+ Run the following command to install all necessary dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
## Data Preparation
@@ -34,36 +75,114 @@ Before running benchmarks, you need to:
1. **Prepare Test Data**
- - Download the retrieval file:
+ - Testing for general benchmark target:
+
+ Download the retrieval file using the command below for data ingestion in RAG:
+
```bash
wget https://github.com/opea-project/GenAIEval/tree/main/evals/benchmark/data/upload_file.txt
```
- - For the `chatqna_qlist_pubmed` test case, prepare `pubmed_${max_lines}.txt` by following this [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/stresscli/README_Pubmed_qlist.md)
+
+ - Testing for pubmed benchmark target:
+
+ For the `chatqna_qlist_pubmed` test case, prepare `pubmed_${max_lines}.txt` by following this [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/stresscli/README_Pubmed_qlist.md)
+
+ After the data is prepared, please update the `absolute path` of this file in the benchmark.yaml file. For example, in the `ChatQnA/benchmark_chatqna.yaml` file, `/home/sdp/upload_file.txt` should be replaced by your file path.
2. **Prepare Model Files (Recommended)**
```bash
pip install -U "huggingface_hub[cli]"
sudo mkdir -p /mnt/models
sudo chmod 777 /mnt/models
- huggingface-cli download --cache-dir /mnt/models Intel/neural-chat-7b-v3-3
+ huggingface-cli download --cache-dir /mnt/models meta-llama/Meta-Llama-3-8B-Instruct
```
-## Overview
+## Running Deploy and Benchmark Tests
The benchmarking process consists of two main components: deployment and benchmarking. We provide `deploy_and_benchmark.py` as a unified entry point that combines both steps.
-### Using deploy_and_benchmark.py (Recommended)
+### Running the Tests
-The script `deploy_and_benchmark.py` serves as the main entry point. Here's an example using ChatQnA configuration (you can replace it with any other example's configuration YAML file):
+The script `deploy_and_benchmark.py` serves as the main entry point. You can use any example's configuration YAML file. Here are examples using ChatQnA configuration:
1. For a specific number of nodes:
```bash
+ # Default OOB (Out of Box) mode
python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml --target-node 1
+
+ # Or specify test mode explicitly
+ python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml --target-node 1 --test-mode [oob|tune]
```
2. For all node configurations:
+
```bash
+ # Default OOB (Out of Box) mode
python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml
+
+ # Or specify test mode explicitly
+ python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml --test-mode [oob|tune]
+ ```
+
+ This will process all node configurations defined in your YAML file.
+
+### Test Modes
+
+The script provides two test modes controlled by the `--test-mode` parameter:
+
+1. **OOB (Out of Box) Mode** - Default
+
+ ```bash
+ --test-mode oob # or omit the parameter
+ ```
+
+ - Uses enabled configurations only:
+ - Resources: Only uses resources when `resources.enabled` is True
+ - Model parameters:
+ - Uses batch parameters when `batch_params.enabled` is True
+ - Uses token parameters when `token_params.enabled` is True
+ - Suitable for basic functionality testing with selected optimizations
+
+2. **Tune Mode**
+ ```bash
+ --test-mode tune
```
- This will iterate through the node list in your configuration YAML file, performing deployment and benchmarking for each node count.
+ - Applies all configurations regardless of enabled status:
+ - Resource-related parameters:
+ - `resources.cores_per_instance`: CPU cores allocation
+ - `resources.memory_capacity`: Memory allocation
+ - `resources.cards_per_instance`: GPU/Accelerator cards allocation
+ - Model parameters:
+ - Batch parameters:
+ - `max_batch_size`: Maximum batch size (TGI engine)
+ - `max_num_seqs`: Maximum number of sequences (vLLM engine)
+ - Token parameters:
+ - `max_input_length`: Maximum input sequence length
+ - `max_total_tokens`: Maximum total tokens per request
+ - `max_batch_total_tokens`: Maximum total tokens in a batch
+ - `max_batch_prefill_tokens`: Maximum tokens in prefill phase
+
+Choose "oob" mode when you want to selectively enable optimizations, or "tune" mode when you want to apply all available optimizations regardless of their enabled status.
+
+### Troubleshooting
+
+**Helm Chart Directory Issues**
+
+- During execution, the script downloads and extracts the Helm chart to a directory named after your example
+- The directory name is derived from your input YAML file path
+ - For example: if your input is `./ChatQnA/benchmark_chatqna.yaml`, the extracted directory will be `chatqna/`
+- In some error cases, this directory might not be properly cleaned up
+- If you encounter deployment issues, check if there's a leftover Helm chart directory:
+
+ ```bash
+ # Example: for ./ChatQnA/benchmark_chatqna.yaml
+ ls -la chatqna/
+
+ # Clean up if needed
+ rm -rf chatqna/
+ ```
+
+- After cleaning up the directory, try running the deployment again
+
+Note: Always ensure there are no leftover Helm chart directories from previous failed runs before starting a new deployment.
diff --git a/benchmark.py b/benchmark.py
index fb20367c08..202a2cb012 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
import os
-import sys
from datetime import datetime
+import requests
import yaml
from evals.benchmark.stresscli.commands.load_test import locust_runtests
from kubernetes import client, config
@@ -25,17 +25,15 @@ def construct_benchmark_config(test_suite_config):
"""Extract relevant data from the YAML based on the specified test cases."""
return {
- "concurrency": test_suite_config.get("concurrency", []),
- "totoal_query_num": test_suite_config.get("user_queries", []),
- "duration:": test_suite_config.get("duration:", []),
- "query_num_per_concurrency": test_suite_config.get("query_num_per_concurrency", []),
- "possion": test_suite_config.get("possion", False),
- "possion_arrival_rate": test_suite_config.get("possion_arrival_rate", 1.0),
+ "user_queries": test_suite_config.get("user_queries", [1]),
+ "concurrency": test_suite_config.get("concurrency", [1]),
+ "load_shape_type": test_suite_config.get("load_shape_type", "constant"),
+ "poisson_arrival_rate": test_suite_config.get("poisson_arrival_rate", 1.0),
"warmup_iterations": test_suite_config.get("warmup_iterations", 10),
"seed": test_suite_config.get("seed", None),
- "test_cases": test_suite_config.get("test_cases", ["chatqnafixed"]),
- "user_queries": test_suite_config.get("user_queries", [1]),
- "query_token_size": test_suite_config.get("query_token_size", 128),
+ "bench_target": test_suite_config.get("bench_target", ["chatqnafixed"]),
+ "dataset": test_suite_config.get("dataset", ""),
+ "prompt": test_suite_config.get("prompt", [10]),
"llm_max_token_size": test_suite_config.get("llm", {}).get("max_token_size", [128]),
}
@@ -97,17 +95,11 @@ def _get_service_ip(service_name, deployment_type="k8s", service_ip=None, servic
return svc_ip, port
-def _create_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params):
+def _create_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params, concurrency=1):
"""Create content for the run.yaml file."""
- # If a load shape includes the parameter concurrent_level,
- # the parameter will be passed to Locust to launch fixed
- # number of simulated users.
- concurrency = 1
- if num_queries >= 0:
- concurrency = max(1, num_queries // test_params["concurrent_level"])
- else:
- concurrency = test_params["concurrent_level"]
+ # calculate the number of concurrent users
+ concurrent_level = int(num_queries // concurrency)
import importlib.util
@@ -116,16 +108,21 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
print(spec)
# get folder path of opea-eval
- eval_path = None
- import pkg_resources
+ eval_path = os.getenv("EVAL_PATH", "")
+ if not eval_path:
+ import pkg_resources
- for dist in pkg_resources.working_set:
- if "opea-eval" in dist.project_name:
- eval_path = dist.location
+ for dist in pkg_resources.working_set:
+ if "opea-eval" in dist.project_name:
+ eval_path = dist.location
+ break
if not eval_path:
- print("Fail to load opea-eval package. Please install it first.")
+ print("Fail to find the opea-eval package. Please set/install it first.")
exit(1)
+ load_shape = test_params["load_shape"]
+ load_shape["params"]["constant"] = {"concurrent_level": concurrent_level}
+
yaml_content = {
"profile": {
"storage": {"hostpath": test_params["test_output_dir"]},
@@ -133,8 +130,9 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
"tool": "locust",
"locustfile": os.path.join(eval_path, "evals/benchmark/stresscli/locust/aistress.py"),
"host": base_url,
+ "run-time": test_params["run_time"],
"stop-timeout": test_params["query_timeout"],
- "processes": 2,
+                "processes": 16,  # default is 2; increased to avoid the benchmark hang with large user queries
"namespace": test_params["namespace"],
"bench-target": bench_target,
"service-metric-collect": test_params["collect_service_metric"],
@@ -145,42 +143,38 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
"seed": test_params.get("seed", None),
"llm-model": test_params["llm_model"],
"deployment-type": test_params["deployment_type"],
- "load-shape": test_params["load_shape"],
+ "load-shape": load_shape,
},
"runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}],
}
}
- # For the following scenarios, test will stop after the specified run-time
- if test_params["run_time"] is not None and test_phase != "warmup":
- yaml_content["profile"]["global-settings"]["run-time"] = test_params["run_time"]
-
return yaml_content
-def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, base_url, ts) -> str:
+def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, base_url, ts, concurrency=1) -> str:
"""Create a stresscli configuration file and persist it on disk."""
stresscli_confs = []
# Get the workload
- test_cases = test_params["test_cases"]
- for test_case in test_cases:
+ bench_target = test_params["bench_target"]
+ for i, b_target in enumerate(bench_target):
stresscli_conf = {}
- print(test_case)
- if isinstance(test_case, str):
- bench_target = test_case
- elif isinstance(test_case, dict):
- bench_target = list(test_case.keys())[0]
- dataset_conf = test_case[bench_target]
- if bench_target == "chatqna_qlist_pubmed":
- max_lines = dataset_conf["dataset"].split("pub_med")[-1]
- stresscli_conf["envs"] = {"DATASET": f"pubmed_{max_lines}.txt", "MAX_LINES": max_lines}
+ print(f"[OPEA BENCHMARK] 🚀 Running test for {b_target} in phase {test_phase} for {num_queries} queries")
+ if len(test_params["dataset"]) > i:
+ stresscli_conf["envs"] = {"DATASET": test_params["dataset"][i], "MAX_LINES": str(test_params["prompt"][i])}
+ else:
+ stresscli_conf["envs"] = {"MAX_LINES": str(test_params["prompt"][i])}
# Generate the content of stresscli configuration file
- stresscli_yaml = _create_yaml_content(case_params, base_url, bench_target, test_phase, num_queries, test_params)
+ stresscli_yaml = _create_yaml_content(
+ case_params, base_url, b_target, test_phase, num_queries, test_params, concurrency
+ )
# Dump the stresscli configuration file
service_name = case_params.get("service_name")
+ max_output = case_params.get("max_output")
run_yaml_path = os.path.join(
- test_params["test_output_dir"], f"run_{service_name}_{ts}_{test_phase}_{num_queries}_{bench_target}.yaml"
+ test_params["test_output_dir"],
+ f"run_{test_phase}_{service_name}_{num_queries}_{b_target}_{max_output}_{ts}.yaml",
)
with open(run_yaml_path, "w") as yaml_file:
yaml.dump(stresscli_yaml, yaml_file)
@@ -207,15 +201,79 @@ def create_stresscli_confs(service, base_url, test_suite_config, index):
stresscli_confs.extend(_create_stresscli_confs(service, test_suite_config, "benchmark", -1, base_url, index))
else:
# Test stop is controlled by request count
- for user_queries in user_queries_lst:
+ for i, user_query in enumerate(user_queries_lst):
+ concurrency_list = test_suite_config["concurrency"]
+ user_query *= test_suite_config["node_num"]
stresscli_confs.extend(
- _create_stresscli_confs(service, test_suite_config, "benchmark", user_queries, base_url, index)
+ _create_stresscli_confs(
+ service,
+ test_suite_config,
+ "benchmark",
+ user_query,
+ base_url,
+ index,
+ concurrency=concurrency_list[i],
+ )
)
return stresscli_confs
-def _run_service_test(example, service, test_suite_config):
+def ingest_data_to_db(service, dataset, namespace):
+ """Ingest data into the database."""
+ for service_name in service.get("service_list"):
+ if "data" in service_name:
+ # Ingest data into the database
+ print(f"[OPEA BENCHMARK] 🚀 Ingesting data into the database for {service_name}...")
+ try:
+ svc_ip, port = _get_service_ip(service_name, "k8s", None, None, namespace)
+ url = f"http://{svc_ip}:{port}/v1/dataprep/ingest"
+
+ files = {"files": open(dataset, "rb")}
+
+ response = requests.post(url, files=files)
+ if response.status_code != 200:
+ print(f"Error ingesting data: {response.text}. Status code: {response.status_code}")
+ return False
+ if "Data preparation succeeded" not in response.text:
+ print(f"Error ingesting data: {response.text}. Response: {response}")
+ return False
+
+ except Exception as e:
+ print(f"Error ingesting data: {e}")
+ return False
+ print(f"[OPEA BENCHMARK] 🚀 Data ingestion completed for {service_name}.")
+ break
+ return True
+
+
+def clear_db(service, namespace):
+ """Delete all files from the database."""
+ for service_name in service.get("service_list"):
+ if "data" in service_name:
+ # Delete data from the database
+ try:
+ svc_ip, port = _get_service_ip(service_name, "k8s", None, None, namespace)
+ url = f"http://{svc_ip}:{port}/v1/dataprep/delete"
+ data = {"file_path": "all"}
+ print(f"[OPEA BENCHMARK] 🚀 Deleting data from the database for {service_name} with {url}")
+
+ response = requests.post(url, json=data, headers={"Content-Type": "application/json"})
+ if response.status_code != 200:
+ print(f"Error deleting data: {response.text}. Status code: {response.status_code}")
+ return False
+ if "true" not in response.text:
+ print(f"Error deleting data: {response.text}. Response: {response}")
+ return False
+ except Exception as e:
+ print(f"Error deleting data: {e}")
+ return False
+ print(f"[OPEA BENCHMARK] 🚀 Data deletion completed for {service_name}.")
+ break
+ return True
+
+
+def _run_service_test(example, service, test_suite_config, namespace):
"""Run the test for a specific service and example."""
print(f"[OPEA BENCHMARK] 🚀 Example: [ {example} ] Service: [ {service.get('service_name')} ], Running test...")
@@ -251,44 +309,94 @@ def _run_service_test(example, service, test_suite_config):
run_yaml_path = stresscli_conf["run_yaml_path"]
print(f"[OPEA BENCHMARK] 🚀 The {index} time test is running, run yaml: {run_yaml_path}...")
os.environ["MAX_TOKENS"] = str(service.get("max_output"))
+
+ dataset = None
if stresscli_conf.get("envs") is not None:
for key, value in stresscli_conf.get("envs").items():
os.environ[key] = value
+ if key == "DATASET":
+ dataset = value
+
+ if dataset:
+ # Ingest data into the database for single run of benchmark
+ result = ingest_data_to_db(service, dataset, namespace)
+ if not result:
+ print(f"[OPEA BENCHMARK] 🚀 Data ingestion failed for {service_name}.")
+ exit(1)
+ else:
+ print(f"[OPEA BENCHMARK] 🚀 Dataset is not specified for {service_name}. Check the benchmark.yaml again.")
+
+ # Run the benchmark test and append the output folder to the list
+ print("[OPEA BENCHMARK] 🚀 Start locust_runtests at", datetime.now().strftime("%Y%m%d_%H%M%S"))
+ locust_output = locust_runtests(None, run_yaml_path)
+ print(f"[OPEA BENCHMARK] 🚀 locust_output origin name is {locust_output}")
+ # Rename the output folder to include the index
+ new_output_path = os.path.join(
+ os.path.dirname(run_yaml_path), f"{os.path.splitext(os.path.basename(run_yaml_path))[0]}_output"
+ )
+ os.rename(locust_output, new_output_path)
+ print(f"[OPEA BENCHMARK] 🚀 locust new_output_path is {new_output_path}")
+
+ output_folders.append(new_output_path)
+ print("[OPEA BENCHMARK] 🚀 End locust_runtests at", datetime.now().strftime("%Y%m%d_%H%M%S"))
- output_folders.append(locust_runtests(None, run_yaml_path))
+ # Delete all files from the database after the test
+ result = clear_db(service, namespace)
+ print("[OPEA BENCHMARK] 🚀 End of clean up db", datetime.now().strftime("%Y%m%d_%H%M%S"))
+ if not result:
+ print(f"[OPEA BENCHMARK] 🚀 Data deletion failed for {service_name}.")
+ exit(1)
print(f"[OPEA BENCHMARK] 🚀 Test completed for {service_name} at {url}")
return output_folders
-def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, report=False):
+def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model=None, report=False, output_dir=None):
+ """Run the benchmark test for the specified helm chart and configuration.
+
+ Args:
+ benchmark_config (dict): The benchmark configuration.
+ chart_name (str): The name of the helm chart.
+ namespace (str): The namespace to deploy the chart.
+ node_num (int): The number of nodes of current deployment.
+ llm_model (str): The LLM model to use for the test.
+ report (bool): Whether to generate a report after the test.
+ output_dir (str): Directory to store the test output. If None, uses default directory.
+ """
# If llm_model is None or an empty string, set to default value
if not llm_model:
- llm_model = "Qwen/Qwen2.5-Coder-7B-Instruct"
+ llm_model = "meta-llama/Meta-Llama-3-8B-Instruct"
# Extract data
parsed_data = construct_benchmark_config(benchmark_config)
test_suite_config = {
"user_queries": parsed_data["user_queries"], # num of user queries
"random_prompt": False, # whether to use random prompt, set to False by default
- "run_time": "60m", # The max total run time for the test suite, set to 60m by default
+        "run_time": "30m",  # The max total run time for the test suite, set to 30m by default
"collect_service_metric": False, # whether to collect service metrics, set to False by default
"llm_model": llm_model, # The LLM model used for the test
"deployment_type": "k8s", # Default is "k8s", can also be "docker"
"service_ip": None, # Leave as None for k8s, specify for Docker
"service_port": None, # Leave as None for k8s, specify for Docker
- "test_output_dir": os.getcwd() + "/benchmark_output", # The directory to store the test output
+ "test_output_dir": (
+ output_dir if output_dir else os.getcwd() + "/benchmark_output"
+ ), # Use output_dir if provided
+ "node_num": node_num,
"load_shape": {
- "name": "constant",
- "params": {"constant": {"concurrent_level": 4}, "poisson": {"arrival_rate": 1.0}},
+ "name": parsed_data["load_shape_type"],
+ "params": {
+ "poisson": {"arrival_rate": parsed_data["poisson_arrival_rate"]},
+ },
},
- "concurrent_level": 4,
- "arrival_rate": 1.0,
+ "concurrency": parsed_data["concurrency"],
+ "arrival_rate": parsed_data["poisson_arrival_rate"],
"query_timeout": 120,
"warm_ups": parsed_data["warmup_iterations"],
"seed": parsed_data["seed"],
"namespace": namespace,
- "test_cases": parsed_data["test_cases"],
+ "bench_target": parsed_data["bench_target"],
+ "dataset": parsed_data["dataset"],
+ "prompt": parsed_data["prompt"],
"llm_max_token_size": parsed_data["llm_max_token_size"],
}
@@ -313,15 +421,14 @@ def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, repor
"chatqna-retriever-usvc",
"chatqna-tei",
"chatqna-teirerank",
- "chatqna-tgi",
+ "chatqna-vllm",
],
- "test_cases": parsed_data["test_cases"],
# Activate if random_prompt=true: leave blank = default dataset(WebQuestions) or sharegpt
"prompts": query_data,
"max_output": llm_max_token, # max number of output tokens
"k": 1, # number of retrieved documents
}
- output_folder = _run_service_test(chart_name, case_data, test_suite_config)
+ output_folder = _run_service_test(chart_name, case_data, test_suite_config, namespace)
print(f"[OPEA BENCHMARK] 🚀 Test Finished. Output saved in {output_folder}.")
@@ -339,5 +446,5 @@ def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, repor
if __name__ == "__main__":
- benchmark_config = load_yaml("./benchmark.yaml")
- run_benchmark(benchmark_config=benchmark_config, chart_name="chatqna", namespace="deploy-benchmark")
+ benchmark_config = load_yaml("./ChatQnA/benchmark_chatqna.yaml")
+ run_benchmark(benchmark_config=benchmark_config, chart_name="chatqna", namespace="benchmark")
diff --git a/deploy.py b/deploy.py
index 21dd278cc2..bd3a8a87d5 100644
--- a/deploy.py
+++ b/deploy.py
@@ -49,12 +49,14 @@ def configure_replica(values, deploy_config):
return values
-def get_output_filename(num_nodes, with_rerank, example_type, device, action_type):
+def get_output_filename(num_nodes, with_rerank, example_type, device, action_type, batch_size=None):
"""Generate output filename based on configuration."""
rerank_suffix = "with-rerank-" if with_rerank else ""
action_suffix = "deploy-" if action_type == 0 else "update-" if action_type == 1 else ""
+ # Only include batch_suffix if batch_size is not None
+ batch_suffix = f"batch{batch_size}-" if batch_size else ""
- return f"{example_type}-{num_nodes}-{device}-{action_suffix}{rerank_suffix}values.yaml"
+ return f"{example_type}-{rerank_suffix}{device}-{action_suffix}node{num_nodes}-{batch_suffix}values.yaml"
def configure_resources(values, deploy_config):
@@ -62,30 +64,31 @@ def configure_resources(values, deploy_config):
resource_configs = []
for service_name, config in deploy_config["services"].items():
+ # Skip if resources configuration doesn't exist or is not enabled
+ resources_config = config.get("resources", {})
+ if not resources_config:
+ continue
+
resources = {}
- if deploy_config["device"] == "gaudi" and config.get("cards_per_instance", 0) > 1:
+ if deploy_config["device"] == "gaudi" and resources_config.get("cards_per_instance", 0) > 1:
resources = {
- "limits": {"habana.ai/gaudi": config["cards_per_instance"]},
- "requests": {"habana.ai/gaudi": config["cards_per_instance"]},
+ "limits": {"habana.ai/gaudi": resources_config["cards_per_instance"]},
+ "requests": {"habana.ai/gaudi": resources_config["cards_per_instance"]},
}
else:
- limits = {}
- requests = {}
-
- # Only add CPU if cores_per_instance has a value
- if config.get("cores_per_instance"):
- limits["cpu"] = config["cores_per_instance"]
- requests["cpu"] = config["cores_per_instance"]
-
- # Only add memory if memory_capacity has a value
- if config.get("memory_capacity"):
- limits["memory"] = config["memory_capacity"]
- requests["memory"] = config["memory_capacity"]
-
- # Only create resources if we have any limits/requests
- if limits and requests:
- resources["limits"] = limits
- resources["requests"] = requests
+ # Only add CPU if cores_per_instance has a valid value
+ cores = resources_config.get("cores_per_instance")
+ if cores is not None and cores != "":
+ resources = {"limits": {"cpu": cores}, "requests": {"cpu": cores}}
+
+ # Only add memory if memory_capacity has a valid value
+ memory = resources_config.get("memory_capacity")
+ if memory is not None and memory != "":
+ if not resources:
+ resources = {"limits": {"memory": memory}, "requests": {"memory": memory}}
+ else:
+ resources["limits"]["memory"] = memory
+ resources["requests"]["memory"] = memory
if resources:
if service_name == "llm":
@@ -116,48 +119,64 @@ def configure_resources(values, deploy_config):
def configure_extra_cmd_args(values, deploy_config):
"""Configure extra command line arguments for services."""
+ batch_size = None
for service_name, config in deploy_config["services"].items():
- extra_cmd_args = []
-
- for param in [
- "max_batch_size",
- "max_input_length",
- "max_total_tokens",
- "max_batch_total_tokens",
- "max_batch_prefill_tokens",
- ]:
- if config.get(param):
- extra_cmd_args.extend([f"--{param.replace('_', '-')}", str(config[param])])
-
- if extra_cmd_args:
- if service_name == "llm":
- engine = config.get("engine", "tgi")
+ if service_name == "llm":
+ extra_cmd_args = []
+ engine = config.get("engine", "tgi")
+ model_params = config.get("model_params", {})
+
+ # Get engine-specific parameters
+ engine_params = model_params.get(engine, {})
+
+ # Get batch parameters and token parameters configuration
+ batch_params = engine_params.get("batch_params", {})
+ token_params = engine_params.get("token_params", {})
+
+ # Get batch size based on engine type
+ if engine == "tgi":
+ batch_size = batch_params.get("max_batch_size")
+ elif engine == "vllm":
+ batch_size = batch_params.get("max_num_seqs")
+ batch_size = batch_size if batch_size and batch_size != "" else None
+
+ # Add all parameters that exist in batch_params
+ for param, value in batch_params.items():
+ if value is not None and value != "":
+ extra_cmd_args.extend([f"--{param.replace('_', '-')}", str(value)])
+
+ # Add all parameters that exist in token_params
+ for param, value in token_params.items():
+ if value is not None and value != "":
+ extra_cmd_args.extend([f"--{param.replace('_', '-')}", str(value)])
+
+ if extra_cmd_args:
if engine not in values:
values[engine] = {}
values[engine]["extraCmdArgs"] = extra_cmd_args
- else:
- if service_name not in values:
- values[service_name] = {}
- values[service_name]["extraCmdArgs"] = extra_cmd_args
+ print(f"extraCmdArgs: {extra_cmd_args}")
- return values
+ return values, batch_size
def configure_models(values, deploy_config):
"""Configure model settings for services."""
for service_name, config in deploy_config["services"].items():
- # Skip if no model_id defined or service is disabled
- if not config.get("model_id") or config.get("enabled") is False:
+ # Get model_id and check if it's valid (not None or empty string)
+ model_id = config.get("model_id")
+ if not model_id or model_id == "" or config.get("enabled") is False:
continue
if service_name == "llm":
# For LLM service, use its engine as the key
+ # Check if engine is valid (not None or empty string)
engine = config.get("engine", "tgi")
- values[engine]["LLM_MODEL_ID"] = config.get("model_id")
+ if engine and engine != "":
+ values[engine]["LLM_MODEL_ID"] = model_id
elif service_name == "tei":
- values[service_name]["EMBEDDING_MODEL_ID"] = config.get("model_id")
+ values[service_name]["EMBEDDING_MODEL_ID"] = model_id
elif service_name == "teirerank":
- values[service_name]["RERANK_MODEL_ID"] = config.get("model_id")
+ values[service_name]["RERANK_MODEL_ID"] = model_id
return values
@@ -209,13 +228,13 @@ def generate_helm_values(example_type, deploy_config, chart_dir, action_type, no
values = configure_rerank(values, with_rerank, deploy_config, example_type, node_selector or {})
values = configure_replica(values, deploy_config)
values = configure_resources(values, deploy_config)
- values = configure_extra_cmd_args(values, deploy_config)
+ values, batch_size = configure_extra_cmd_args(values, deploy_config)
values = configure_models(values, deploy_config)
device = deploy_config.get("device", "unknown")
# Generate and write YAML file
- filename = get_output_filename(num_nodes, with_rerank, example_type, device, action_type)
+ filename = get_output_filename(num_nodes, with_rerank, example_type, device, action_type, batch_size)
yaml_string = yaml.dump(values, default_flow_style=False)
filepath = os.path.join(chart_dir, filename)
@@ -376,12 +395,24 @@ def install_helm_release(release_name, chart_name, namespace, hw_values_file, de
def uninstall_helm_release(release_name, namespace=None):
- """Uninstall a Helm release and clean up resources, optionally delete the namespace if not 'default'."""
+ """Uninstall a Helm release and clean up resources, optionally delete the namespace if not 'default'.
+
+ First checks if the release exists before attempting to uninstall.
+ """
# Default to 'default' namespace if none is specified
if not namespace:
namespace = "default"
try:
+ # Check if the release exists
+ check_command = ["helm", "list", "--namespace", namespace, "--filter", release_name, "--output", "json"]
+ output = run_kubectl_command(check_command)
+ releases = json.loads(output)
+
+ if not releases:
+ print(f"Helm release {release_name} not found in namespace {namespace}. Nothing to uninstall.")
+ return
+
# Uninstall the Helm release
command = ["helm", "uninstall", release_name, "--namespace", namespace]
print(f"Uninstalling Helm release {release_name} in namespace {namespace}...")
@@ -399,6 +430,8 @@ def uninstall_helm_release(release_name, namespace=None):
except subprocess.CalledProcessError as e:
print(f"Error occurred while uninstalling Helm release or deleting namespace: {e}")
+ except json.JSONDecodeError as e:
+ print(f"Error parsing helm list output: {e}")
def update_service(release_name, chart_name, namespace, hw_values_file, deploy_values_file, update_values_file):
@@ -449,7 +482,7 @@ def read_deploy_config(config_path):
return None
-def check_deployment_ready(release_name, namespace, timeout=300, interval=5, logfile="deployment.log"):
+def check_deployment_ready(release_name, namespace, timeout=1000, interval=5, logfile="deployment.log"):
"""Wait until all pods in the deployment are running and ready.
Args:
@@ -586,6 +619,18 @@ def main():
parser.add_argument("--update-service", action="store_true", help="Update the deployment with new configuration.")
parser.add_argument("--check-ready", action="store_true", help="Check if all services in the deployment are ready.")
parser.add_argument("--chart-dir", default=".", help="Path to the untarred Helm chart directory.")
+ parser.add_argument(
+ "--timeout",
+ type=int,
+ default=1000,
+ help="Maximum time to wait for deployment readiness in seconds (default: 1000)",
+ )
+ parser.add_argument(
+ "--interval",
+ type=int,
+ default=5,
+ help="Interval between readiness checks in seconds (default: 5)",
+ )
args = parser.parse_args()
@@ -597,7 +642,7 @@ def main():
clear_labels_from_nodes(args.label, args.node_names)
return
elif args.check_ready:
- is_ready = check_deployment_ready(args.chart_name, args.namespace)
+ is_ready = check_deployment_ready(args.chart_name, args.namespace, args.timeout, args.interval)
return is_ready
elif args.uninstall:
uninstall_helm_release(args.chart_name, args.namespace)
@@ -659,6 +704,7 @@ def main():
update_service(
args.chart_name, args.chart_name, args.namespace, hw_values_file, args.user_values, values_file_path
)
+ print(f"values_file_path: {values_file_path}")
return
except Exception as e:
parser.error(f"Failed to update deployment: {str(e)}")
diff --git a/deploy_and_benchmark.py b/deploy_and_benchmark.py
index 1dc4c4308d..f210f215dc 100644
--- a/deploy_and_benchmark.py
+++ b/deploy_and_benchmark.py
@@ -23,13 +23,14 @@ def read_yaml(file_path):
return None
-def construct_deploy_config(deploy_config, target_node, max_batch_size=None):
- """Construct a new deploy config based on the target node number and optional max_batch_size.
+def construct_deploy_config(deploy_config, target_node, batch_param_value=None, test_mode="oob"):
+ """Construct a new deploy config based on the target node number and optional batch parameter value.
Args:
deploy_config: Original deploy config dictionary
target_node: Target node number to match in the node array
- max_batch_size: Optional specific max_batch_size value to use
+ batch_param_value: Optional specific batch parameter value to use
+ test_mode: Test mode, either 'oob' or 'tune'
Returns:
A new deploy config with single values for node and instance_num
@@ -51,21 +52,79 @@ def construct_deploy_config(deploy_config, target_node, max_batch_size=None):
# Set the single node value
new_config["node"] = target_node
- # Update instance_num for each service based on the same index
- for service_name, service_config in new_config.get("services", {}).items():
+ # First determine which llm replicaCount to use based on teirerank.enabled
+ services = new_config.get("services", {})
+ teirerank_enabled = services.get("teirerank", {}).get("enabled", True)
+
+ # Process each service's configuration
+ for service_name, service_config in services.items():
+ # Handle replicaCount
if "replicaCount" in service_config:
- instance_nums = service_config["replicaCount"]
- if isinstance(instance_nums, list):
- if len(instance_nums) != len(nodes):
+ if service_name == "llm" and isinstance(service_config["replicaCount"], dict):
+ replica_counts = service_config["replicaCount"]
+ service_config["replicaCount"] = (
+ replica_counts["with_teirerank"] if teirerank_enabled else replica_counts["without_teirerank"]
+ )
+
+ if isinstance(service_config["replicaCount"], list):
+ if len(service_config["replicaCount"]) < len(nodes):
raise ValueError(
- f"instance_num array length ({len(instance_nums)}) for service {service_name} "
- f"doesn't match node array length ({len(nodes)})"
+ f"replicaCount array length ({len(service_config['replicaCount'])}) for service {service_name} "
+ f"smaller than node array length ({len(nodes)})"
)
- service_config["replicaCount"] = instance_nums[node_index]
-
- # Update max_batch_size if specified
- if max_batch_size is not None and "llm" in new_config["services"]:
- new_config["services"]["llm"]["max_batch_size"] = max_batch_size
+ service_config["replicaCount"] = service_config["replicaCount"][node_index]
+
+ # Handle resources based on test_mode
+ if "resources" in service_config:
+ resources = service_config["resources"]
+ if test_mode == "tune" or resources.get("enabled", False):
+ # Keep resource configuration but remove enabled field
+ resources.pop("enabled", None)
+ else:
+ # Remove resource configuration in OOB mode when disabled
+ service_config.pop("resources")
+
+ # Handle model parameters for LLM service
+ if service_name == "llm" and "model_params" in service_config:
+ model_params = service_config["model_params"]
+ engine = service_config.get("engine", "tgi")
+
+ # Get engine-specific parameters
+ engine_params = model_params.get(engine, {})
+
+ # Handle batch parameters
+ if "batch_params" in engine_params:
+ batch_params = engine_params["batch_params"]
+ if test_mode == "tune" or batch_params.get("enabled", False):
+ # Keep batch parameters configuration but remove enabled field
+ batch_params.pop("enabled", None)
+
+ # Update batch parameter value if specified
+ if batch_param_value is not None:
+ if engine == "tgi":
+ batch_params["max_batch_size"] = str(batch_param_value)
+ elif engine == "vllm":
+ batch_params["max_num_seqs"] = str(batch_param_value)
+ else:
+ engine_params.pop("batch_params")
+
+ # Handle token parameters
+ if "token_params" in engine_params:
+ token_params = engine_params["token_params"]
+ if test_mode == "tune" or token_params.get("enabled", False):
+ # Keep token parameters configuration but remove enabled field
+ token_params.pop("enabled", None)
+ else:
+ # Remove token parameters in OOB mode when disabled
+ engine_params.pop("token_params")
+
+ # Update model_params with engine-specific parameters only
+ model_params.clear()
+ model_params[engine] = engine_params
+
+ # Remove model_params if empty or if engine_params is empty
+ if not model_params or not engine_params:
+ service_config.pop("model_params")
return new_config
@@ -84,13 +143,18 @@ def pull_helm_chart(chart_pull_url, version, chart_name):
return untar_dir
-def main(yaml_file, target_node=None):
+def main(yaml_file, target_node=None, test_mode="oob"):
"""Main function to process deployment configuration.
Args:
yaml_file: Path to the YAML configuration file
target_node: Optional target number of nodes to deploy. If not specified, will process all nodes.
+ test_mode: Test mode, either "oob" (out of box) or "tune". Defaults to "oob".
"""
+ if test_mode not in ["oob", "tune"]:
+ print("Error: test_mode must be either 'oob' or 'tune'")
+ return None
+
config = read_yaml(yaml_file)
if config is None:
print("Failed to read YAML file.")
@@ -116,7 +180,7 @@ def main(yaml_file, target_node=None):
# Pull the Helm chart
chart_pull_url = f"oci://ghcr.io/opea-project/charts/{chart_name}"
- version = deploy_config.get("version", "1.1.0")
+ version = deploy_config.get("version", "0-latest")
chart_dir = pull_helm_chart(chart_pull_url, version, chart_name)
if not chart_dir:
return
@@ -140,20 +204,61 @@ def main(yaml_file, target_node=None):
continue
try:
- # Process max_batch_sizes
- max_batch_sizes = deploy_config.get("services", {}).get("llm", {}).get("max_batch_size", [])
- if not isinstance(max_batch_sizes, list):
- max_batch_sizes = [max_batch_sizes]
+ # Process batch parameters based on engine type
+ services = deploy_config.get("services", {})
+ llm_config = services.get("llm", {})
+
+ if "model_params" in llm_config:
+ model_params = llm_config["model_params"]
+ engine = llm_config.get("engine", "tgi")
+
+ # Get engine-specific parameters
+ engine_params = model_params.get(engine, {})
+
+ # Handle batch parameters
+ batch_params = []
+ if "batch_params" in engine_params:
+ key = "max_batch_size" if engine == "tgi" else "max_num_seqs"
+ batch_params = engine_params["batch_params"].get(key, [])
+ param_name = key
+
+ if not isinstance(batch_params, list):
+ batch_params = [batch_params]
+
+ # Skip multiple iterations if batch parameter is empty
+ if batch_params == [""] or not batch_params:
+ batch_params = [None]
+ else:
+ batch_params = [None]
+ param_name = "batch_param"
+
+ # Get timeout and interval from deploy config for check-ready
+ timeout = deploy_config.get("timeout", 1000) # default 1000s
+ interval = deploy_config.get("interval", 5) # default 5s
values_file_path = None
- for i, max_batch_size in enumerate(max_batch_sizes):
- print(f"\nProcessing max_batch_size: {max_batch_size}")
+ # Create benchmark output directory
+ benchmark_dir = os.path.join(os.getcwd(), "benchmark_output")
+ os.makedirs(benchmark_dir, exist_ok=True)
+
+ for i, batch_param in enumerate(batch_params):
+ print(f"\nProcessing {test_mode} mode {param_name}: {batch_param}")
+ # Create subdirectory for this iteration with test mode in the name
+ iteration_dir = os.path.join(
+ benchmark_dir,
+ f"benchmark_{test_mode}_node{node}_batch{batch_param if batch_param is not None else 'default'}",
+ )
+ os.makedirs(iteration_dir, exist_ok=True)
# Construct new deploy config
- new_deploy_config = construct_deploy_config(deploy_config, node, max_batch_size)
+ new_deploy_config = construct_deploy_config(deploy_config, node, batch_param, test_mode)
# Write the new deploy config to a temporary file
- temp_config_file = f"temp_deploy_config_{node}_{max_batch_size}.yaml"
+ temp_config_file = (
+ f"temp_deploy_config_{node}.yaml"
+ if batch_param is None
+ else f"temp_deploy_config_{node}_{batch_param}.yaml"
+ )
try:
with open(temp_config_file, "w") as f:
yaml.dump(new_deploy_config, f)
@@ -178,6 +283,8 @@ def main(yaml_file, target_node=None):
if match:
values_file_path = match.group(1)
print(f"Captured values_file_path: {values_file_path}")
+ # Copy values file to iteration directory
+ shutil.copy2(values_file_path, iteration_dir)
else:
print("values_file_path not found in the output")
@@ -198,12 +305,20 @@ def main(yaml_file, target_node=None):
values_file_path,
"--update-service",
]
- result = subprocess.run(cmd, check=True)
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True)
if result.returncode != 0:
- print(
- f"Update failed for {node} nodes configuration with max_batch_size {max_batch_size}"
- )
- break # Skip remaining max_batch_sizes for this node
+ print(f"Update failed for {node} nodes configuration with {param_name} {batch_param}")
+ break # Skip remaining {param_name} for this node
+
+ # Update values_file_path from the output
+ match = re.search(r"values_file_path: (\S+)", result.stdout)
+ if match:
+ values_file_path = match.group(1)
+ print(f"Updated values_file_path: {values_file_path}")
+ # Copy values file to iteration directory
+ shutil.copy2(values_file_path, iteration_dir)
+ else:
+ print("values_file_path not found in the output")
# Wait for deployment to be ready
print("\nWaiting for deployment to be ready...")
@@ -215,26 +330,42 @@ def main(yaml_file, target_node=None):
"--namespace",
namespace,
"--check-ready",
+ "--timeout",
+ str(timeout),
+ "--interval",
+ str(interval),
]
try:
- result = subprocess.run(cmd, check=True)
- print("Deployments are ready!")
+ result = subprocess.run(
+ cmd, check=False
+ ) # Changed to check=False to handle return code manually
+ if result.returncode == 0:
+ print("Deployments are ready!")
+ # Run benchmark only if deployment is ready
+ run_benchmark(
+ benchmark_config=benchmark_config,
+ chart_name=chart_name,
+ namespace=namespace,
+ node_num=node,
+ llm_model=deploy_config.get("services", {}).get("llm", {}).get("model_id", ""),
+ output_dir=iteration_dir,
+ )
+ else:
+ print(
+ f"Deployments are not ready after timeout period during "
+ f"{'deployment' if i == 0 else 'update'} for {node} nodes. "
+ f"Skipping remaining iterations."
+ )
+ break # Exit the batch parameter loop for current node
except subprocess.CalledProcessError as e:
- print(f"Deployments status failed with returncode: {e.returncode}")
-
- # Run benchmark
- run_benchmark(
- benchmark_config=benchmark_config,
- chart_name=chart_name,
- namespace=namespace,
- llm_model=deploy_config.get("services", {}).get("llm", {}).get("model_id", ""),
- )
+ print(f"Error while checking deployment status: {str(e)}")
+ break # Exit the batch parameter loop for current node
except Exception as e:
print(
- f"Error during {'deployment' if i == 0 else 'update'} for {node} nodes with max_batch_size {max_batch_size}: {str(e)}"
+ f"Error during {'deployment' if i == 0 else 'update'} for {node} nodes with {param_name} {batch_param}: {str(e)}"
)
- break # Skip remaining max_batch_sizes for this node
+ break # Skip remaining {param_name} for this node
finally:
# Clean up the temporary file
if os.path.exists(temp_config_file):
@@ -287,6 +418,7 @@ def main(yaml_file, target_node=None):
parser = argparse.ArgumentParser(description="Deploy and benchmark with specific node configuration.")
parser.add_argument("yaml_file", help="Path to the YAML configuration file")
parser.add_argument("--target-node", type=int, help="Optional: Target number of nodes to deploy.", default=None)
+ parser.add_argument("--test-mode", type=str, help="Test mode, either 'oob' (out of box) or 'tune'.", default="oob")
args = parser.parse_args()
- main(args.yaml_file, args.target_node)
+ main(args.yaml_file, args.target_node, args.test_mode)
diff --git a/requirements.txt b/requirements.txt
index 44f6445aa0..637668c3d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ kubernetes
locust
numpy
opea-eval>=1.2
+prometheus_client
pytest
pyyaml
requests
From d1861f9a45599e83bc8f8804c9c37db087db124f Mon Sep 17 00:00:00 2001
From: alexsin368 <109180236+alexsin368@users.noreply.github.com>
Date: Thu, 27 Feb 2025 21:43:43 -0800
Subject: [PATCH 028/226] Top level README: add link to github.io documentation
(#1584)
Signed-off-by: alexsin368
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
README.md | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6738cef202..2db55575bd 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,10 @@ GenAIExamples are designed to give developers an easy entry into generative AI,
[GenAIEval](https://github.com/opea-project/GenAIEval) measures service performance metrics such as throughput, latency, and accuracy for GenAIExamples. This feature helps users compare performance across various hardware configurations easily.
+## Documentation
+
+The GenAIExamples [documentation](https://opea-project.github.io/latest/examples/index.html) contains a comprehensive guide on all available examples including architecture, deployment guides, and more. Information on GenAIComps, GenAIInfra, and GenAIEval can also be found there.
+
## Getting Started
GenAIExamples offers flexible deployment options that cater to different user needs, enabling efficient use and deployment in various environments. Here’s a brief overview of the three primary methods: Python startup, Docker Compose, and Kubernetes.
@@ -20,7 +24,7 @@ Users can choose the most suitable approach based on ease of setup, scalability
### Deployment Guide
-Deployment are based on released docker images by default, check [docker image list](./docker_images_list.md) for detailed information. You can also build your own images following instructions.
+Deployment is based on released docker images by default, check [docker image list](./docker_images_list.md) for detailed information. You can also build your own images following instructions.
#### Prerequisite
@@ -43,6 +47,8 @@ Deployment are based on released docker images by default, check [docker image l
#### Deploy Examples
+> **Note**: Check for [sample guides](https://opea-project.github.io/latest/examples/index.html) first for your use case. If it is not available, then refer to the table below.
+
| Use Case | Docker Compose<br/>Deployment on Xeon | Docker Compose<br/>Deployment on Gaudi | Docker Compose<br/>Deployment on ROCm | Kubernetes with Helm Charts | Kubernetes with GMC |
| ----------------- | ------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------------- | ------------------------------------------------------------ |
| ChatQnA | [Xeon Instructions](ChatQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](ChatQnA/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](ChatQnA/docker_compose/amd/gpu/rocm/README.md) | [ChatQnA with Helm Charts](ChatQnA/kubernetes/helm/README.md) | [ChatQnA with GMC](ChatQnA/kubernetes/gmc/README.md) |
From a30a6e3c52a639194730c3826ff3b2d0edb80540 Mon Sep 17 00:00:00 2001
From: WenjiaoYue
Date: Fri, 28 Feb 2025 16:10:58 +0800
Subject: [PATCH 029/226] fix click example button issue (#1586)
Signed-off-by: WenjiaoYue
Signed-off-by: Chingis Yundunov
---
AgentQnA/ui/svelte/src/lib/components/home.svelte | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/AgentQnA/ui/svelte/src/lib/components/home.svelte b/AgentQnA/ui/svelte/src/lib/components/home.svelte
index f35ee44575..ba37f1672d 100644
--- a/AgentQnA/ui/svelte/src/lib/components/home.svelte
+++ b/AgentQnA/ui/svelte/src/lib/components/home.svelte
@@ -108,7 +108,7 @@
-          on:click={() => handleCreate(feature)}
+          on:click={() => handleCreate(feature.description)}
>
Date: Fri, 28 Feb 2025 23:40:31 +0900
Subject: [PATCH 030/226] ChatQnA Docker compose file for Milvus as vdb (#1548)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Ezequiel Lanza
Signed-off-by: Kendall González León
Signed-off-by: chensuyue
Signed-off-by: Spycsh
Signed-off-by: Wang, Xigui
Signed-off-by: ZePan110
Signed-off-by: dependabot[bot]
Signed-off-by: minmin-intel
Signed-off-by: Artem Astafev
Signed-off-by: Xinyao Wang
Signed-off-by: Cathy Zhang
Signed-off-by: letonghan
Signed-off-by: alexsin368
Signed-off-by: WenjiaoYue
Co-authored-by: Ezequiel Lanza
Co-authored-by: Kendall González León
Co-authored-by: chen, suyue
Co-authored-by: Spycsh <39623753+Spycsh@users.noreply.github.com>
Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com>
Co-authored-by: jotpalch <49465120+jotpalch@users.noreply.github.com>
Co-authored-by: ZePan110
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: minmin-intel
Co-authored-by: Ying Hu
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eero Tamminen
Co-authored-by: Liang Lv
Co-authored-by: Artem Astafev
Co-authored-by: XinyaoWa
Co-authored-by: alexsin368 <109180236+alexsin368@users.noreply.github.com>
Co-authored-by: WenjiaoYue
Signed-off-by: Chingis Yundunov
---
.../intel/cpu/xeon/compose_milvus.yaml | 227 +++++
.../docker_compose/intel/cpu/xeon/milvus.yaml | 811 ++++++++++++++++++
ChatQnA/tests/test_compose_milvus_on_xeon.sh | 249 ++++++
3 files changed, 1287 insertions(+)
create mode 100644 ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
create mode 100644 ChatQnA/docker_compose/intel/cpu/xeon/milvus.yaml
create mode 100644 ChatQnA/tests/test_compose_milvus_on_xeon.sh
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
new file mode 100644
index 0000000000..740f5eba42
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
@@ -0,0 +1,227 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+services:
+ etcd:
+ container_name: milvus-etcd
+ image: quay.io/coreos/etcd:v3.5.5
+ environment:
+ - ETCD_AUTO_COMPACTION_MODE=revision
+ - ETCD_AUTO_COMPACTION_RETENTION=1000
+ - ETCD_QUOTA_BACKEND_BYTES=4294967296
+ - ETCD_SNAPSHOT_COUNT=50000
+ volumes:
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
+ command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
+ healthcheck:
+ test: ["CMD", "etcdctl", "endpoint", "health"]
+ interval: 30s
+ timeout: 20s
+ retries: 3
+
+ minio:
+ container_name: milvus-minio
+ image: minio/minio:RELEASE.2023-03-20T20-16-18Z
+ environment:
+ MINIO_ACCESS_KEY: minioadmin
+ MINIO_SECRET_KEY: minioadmin
+ ports:
+ - "${MINIO_PORT1:-5044}:9001"
+ - "${MINIO_PORT2:-5043}:9000"
+ volumes:
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
+ command: minio server /minio_data --console-address ":9001"
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+ interval: 30s
+ timeout: 20s
+ retries: 3
+
+ milvus-standalone:
+ container_name: milvus-standalone
+ image: milvusdb/milvus:v2.4.6
+ command: ["milvus", "run", "standalone"]
+ security_opt:
+ - seccomp:unconfined
+ environment:
+ ETCD_ENDPOINTS: etcd:2379
+ MINIO_ADDRESS: minio:9000
+ volumes:
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/milvus.yaml:/milvus/configs/milvus.yaml
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
+ interval: 30s
+ start_period: 90s
+ timeout: 20s
+ retries: 3
+ ports:
+ - "19530:19530"
+ - "${MILVUS_STANDALONE_PORT:-9091}:9091"
+ depends_on:
+ - "etcd"
+ - "minio"
+
+ dataprep-milvus-service:
+ image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
+ container_name: dataprep-milvus-server
+ ports:
+ - "${DATAPREP_PORT:-11101}:5000"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MILVUS"
+ MILVUS_HOST: ${host_ip}
+ MILVUS_PORT: 19530
+ TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
+ LOGFLAG: ${LOGFLAG}
+ restart: unless-stopped
+ depends_on:
+ milvus-standalone:
+ condition: service_healthy
+ etcd:
+ condition: service_healthy
+ minio:
+ condition: service_healthy
+
+ retriever:
+ image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
+ container_name: retriever-milvus-server
+ depends_on:
+ - milvus-standalone
+ ports:
+ - "7000:7000"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ MILVUS_HOST: ${host_ip}
+ MILVUS_PORT: 19530
+ TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LOGFLAG: ${LOGFLAG}
+ RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_MILVUS"
+ restart: unless-stopped
+
+ tei-embedding-service:
+ image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+ container_name: tei-embedding-server
+ ports:
+ - "6006:80"
+ volumes:
+ - "./data:/data"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
+
+ tei-reranking-service:
+ image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+ container_name: tei-reranking-server
+ ports:
+ - "8808:80"
+ volumes:
+ - "./data:/data"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ command: --model-id ${RERANK_MODEL_ID} --auto-truncate
+
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ container_name: vllm-service
+ ports:
+ - "9009:80"
+ volumes:
+ - "./data:/data"
+ shm_size: 128g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
+
+ chatqna-xeon-backend-server:
+ image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
+ container_name: chatqna-xeon-backend-server
+ depends_on:
+ - milvus-standalone
+ - tei-embedding-service
+ - dataprep-milvus-service
+ - retriever
+ - tei-reranking-service
+ - vllm-service
+ ports:
+ - "8888:8888"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server
+ - EMBEDDING_SERVER_HOST_IP=tei-embedding-service
+ - RETRIEVER_SERVICE_HOST_IP=retriever
+ - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80}
+ - RERANK_SERVER_HOST_IP=tei-reranking-service
+ - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
+ - LLM_SERVER_HOST_IP=vllm-service
+ - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
+ - LLM_MODEL=${LLM_MODEL_ID}
+ - LOGFLAG=${LOGFLAG}
+ ipc: host
+ restart: always
+
+ chatqna-xeon-ui-server:
+ image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
+ container_name: chatqna-xeon-ui-server
+ depends_on:
+ - chatqna-xeon-backend-server
+ ports:
+ - "5173:5173"
+ ipc: host
+ restart: always
+
+ chatqna-xeon-nginx-server:
+ image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
+ container_name: chatqna-xeon-nginx-server
+ depends_on:
+ - chatqna-xeon-backend-server
+ - chatqna-xeon-ui-server
+ ports:
+ - "${NGINX_PORT:-80}:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - FRONTEND_SERVICE_IP=chatqna-xeon-ui-server
+ - FRONTEND_SERVICE_PORT=5173
+ - BACKEND_SERVICE_NAME=chatqna
+ - BACKEND_SERVICE_IP=chatqna-xeon-backend-server
+ - BACKEND_SERVICE_PORT=8888
+ - DATAPREP_SERVICE_IP=dataprep-milvus-service
+ - DATAPREP_SERVICE_PORT=5000
+ ipc: host
+ restart: always
+
+
+networks:
+ default:
+ driver: bridge
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/milvus.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/milvus.yaml
new file mode 100644
index 0000000000..b9f22cb3d1
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/milvus.yaml
@@ -0,0 +1,811 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed to the LF AI & Data foundation under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Related configuration of etcd, used to store Milvus metadata & service discovery.
+etcd:
+ endpoints: localhost:2379
+ rootPath: by-dev # The root path where data is stored in etcd
+ metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
+ kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
+ log:
+ level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
+ # path is one of:
+ # - "default" as os.Stderr,
+ # - "stderr" as os.Stderr,
+ # - "stdout" as os.Stdout,
+ # - file path to append server logs to.
+ # please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log
+ path: stdout
+ ssl:
+ enabled: false # Whether to support ETCD secure connection mode
+ tlsCert: /path/to/etcd-client.pem # path to your cert file
+ tlsKey: /path/to/etcd-client-key.pem # path to your key file
+ tlsCACert: /path/to/ca.pem # path to your CACert file
+ # TLS min version
+ # Optional values: 1.0, 1.1, 1.2, 1.3.
+ # We recommend using version 1.2 and above.
+ tlsMinVersion: 1.3
+ requestTimeout: 10000 # Etcd operation timeout in milliseconds
+ use:
+ embed: false # Whether to enable embedded Etcd (an in-process EtcdServer).
+ data:
+ dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/
+ auth:
+ enabled: false # Whether to enable authentication
+ userName: # username for etcd authentication
+ password: # password for etcd authentication
+
+metastore:
+ type: etcd # Default value: etcd, Valid values: [etcd, tikv]
+
+# Related configuration of tikv, used to store Milvus metadata.
+# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery.
+# TiKV is a good option when the metadata size requires better horizontal scalability.
+tikv:
+ endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd.
+ rootPath: by-dev # The root path where data is stored in tikv
+ metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
+ kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
+ requestTimeout: 10000 # ms, tikv request timeout
+ snapshotScanSize: 256 # batch size of tikv snapshot scan
+ ssl:
+ enabled: false # Whether to support TiKV secure connection mode
+ tlsCert: # path to your cert file
+ tlsKey: # path to your key file
+ tlsCACert: # path to your CACert file
+
+localStorage:
+ path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/
+
+# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus.
+# We refer to the storage service as MinIO/S3 in the following description for simplicity.
+minio:
+ address: localhost # Address of MinIO/S3
+ port: 9000 # Port of MinIO/S3
+ accessKeyID: minioadmin # accessKeyID of MinIO/S3
+ secretAccessKey: minioadmin # MinIO/S3 encryption string
+ useSSL: false # Access to MinIO/S3 with SSL
+ ssl:
+ tlsCACert: /path/to/public.crt # path to your CACert file
+ bucketName: a-bucket # Bucket name in MinIO/S3
+ rootPath: files # The root path where the message is stored in MinIO/S3
+ # Whether to useIAM role to access S3/GCS instead of access/secret keys
+ # For more information, refer to
+ # aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html
+ # gcp: https://cloud.google.com/storage/docs/access-control/iam
+ # aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control
+ # aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role
+ useIAM: false
+ # Cloud Provider of S3. Supports: "aws", "gcp", "aliyun".
+ # You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio
+ # You can use "gcp" for other cloud provider supports S3 API with signature v2
+ # You can use "aliyun" for other cloud provider uses virtual host style bucket
+ # When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now
+ cloudProvider: aws
+ # Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws".
+ # Leave it empty if you want to use AWS default endpoint
+ iamEndpoint:
+ logLevel: fatal # Log level for aws sdk log. Supported level: off, fatal, error, warn, info, debug, trace
+ region: # Specify minio storage system location region
+ useVirtualHost: false # Whether use virtual host mode for bucket
+ requestTimeoutMs: 10000 # minio timeout for request time in milliseconds
+ # The maximum number of objects requested per batch in minio ListObjects rpc,
+ # 0 means using oss client by default, decrease these configuration if ListObjects timeout
+ listObjectsMaxKeys: 0
+
+# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka.
+# You can change your mq by setting mq.type field.
+# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file.
+# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka
+# 2. cluster mode: Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode)
+mq:
+ # Default value: "default"
+ # Valid values: [default, pulsar, kafka, rocksmq, natsmq]
+ type: default
+ enablePursuitMode: true # Default value: "true"
+ pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds
+ pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes
+ mqBufSize: 16 # MQ client consumer buffer length
+ dispatcher:
+ mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge
+ targetBufSize: 16 # the length of channel buffer for target
+ maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack
+
+# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services.
+pulsar:
+ address: localhost # Address of pulsar
+ port: 6650 # Port of Pulsar
+ webport: 80 # Web port of pulsar, if you connect directly without proxy, should use 8080
+ maxMessageSize: 5242880 # 5 * 1024 * 1024 Bytes, Maximum size of each message in pulsar.
+ tenant: public
+ namespace: default
+ requestTimeout: 60 # pulsar client global request timeout in seconds
+ enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path.
+
+# If you want to enable kafka, needs to comment the pulsar configs
+# kafka:
+# brokerList:
+# saslUsername:
+# saslPassword:
+# saslMechanisms:
+# securityProtocol:
+# ssl:
+# enabled: false # whether to enable ssl mode
+# tlsCert: # path to client's public key (PEM) used for authentication
+# tlsKey: # path to client's private key (PEM) used for authentication
+# tlsCaCert: # file or directory path to CA certificate(s) for verifying the broker's key
+# tlsKeyPassword: # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any
+# readTimeout: 10
+
+rocksmq:
+ # The path where the message is stored in rocksmq
+ # please adjust in embedded Milvus: /tmp/milvus/rdb_data
+ path: /var/lib/milvus/rdb_data
+ lrucacheratio: 0.06 # rocksdb cache memory ratio
+ rocksmqPageSize: 67108864 # 64 MB, 64 * 1024 * 1024 bytes, The size of each page of messages in rocksmq
+ retentionTimeInMinutes: 4320 # 3 days, 3 * 24 * 60 minutes, The retention time of the message in rocksmq.
+ retentionSizeInMB: 8192 # 8 GB, 8 * 1024 MB, The retention size of the message in rocksmq.
+ compactionInterval: 86400 # 1 day, trigger rocksdb compaction every day to remove deleted data
+ compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level.
+
+# natsmq configuration.
+# more detail: https://docs.nats.io/running-a-nats-service/configuration
+natsmq:
+ server:
+ port: 4222 # Port for nats server listening
+ storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats
+ maxFileStore: 17179869184 # Maximum size of the 'file' storage
+ maxPayload: 8388608 # Maximum number of bytes in a message payload
+ maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections
+ initializeTimeout: 4000 # waiting for initialization of natsmq finished
+ monitor:
+ trace: false # If true enable protocol trace log messages
+ debug: false # If true enable debug log messages
+ logTime: true # If set to false, log without timestamps.
+ logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. of milvus binary if use relative path
+ logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one
+ retention:
+ maxAge: 4320 # Maximum age of any message in the P-channel
+ maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
+ maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
+
+# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests
+rootCoord:
+ dmlChannelNum: 16 # The number of dml channels created at system startup
+ maxPartitionNum: 1024 # Maximum number of partitions in a collection
+ minSegmentSizeToEnableIndex: 1024 # It's a threshold. When the segment size is less than this value, the segment will not be indexed
+ enableActiveStandby: false
+ maxDatabaseNum: 64 # Maximum number of database
+ maxGeneralCapacity: 65536 # upper limit for the sum of products of partitionNumber and shardNumber
+ gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
+ ip: # if not specified, use the first unicastable address
+ port: 53100
+ grpc:
+ serverMaxSendSize: 536870912
+ serverMaxRecvSize: 268435456
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 536870912
+
+# Related configuration of proxy, used to validate client requests and reduce the returned results.
+proxy:
+ timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
+ healthCheckTimeout: 3000 # ms, the interval that to do component healthy check
+ msgStream:
+ timeTick:
+ bufSize: 512
+ maxNameLength: 255 # Maximum length of name for a collection or alias
+ # Maximum number of fields in a collection.
+ # As of today (2.2.0 and after) it is strongly DISCOURAGED to set maxFieldNum >= 64.
+ # So adjust at your risk!
+ maxFieldNum: 64
+ maxVectorFieldNum: 4 # Maximum number of vector fields in a collection.
+ maxShardNum: 16 # Maximum number of shards in a collection
+ maxDimension: 32768 # Maximum dimension of a vector
+ # Whether to produce gin logs.
+ # please adjust in embedded Milvus: false
+ ginLogging: true
+ ginLogSkipPaths: / # skip url path for gin log
+ maxTaskNum: 1024 # max task number of proxy task queue
+ mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection
+ accessLog:
+ enable: false # if use access log
+ minioEnable: false # if upload sealed access log file to minio
+ localPath: /tmp/milvus_access
+ filename: # Log filename, leave empty to use stdout.
+ maxSize: 64 # Max size for a single file, in MB.
+ cacheSize: 10240 # Size of log of memory cache, in B
+ rotatedTime: 0 # Max time for single access log file in seconds
+ remotePath: access_log/ # File path in minIO
+ remoteMaxTime: 0 # Max time for log file in minIO, in hours
+ formatters:
+ base:
+ format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]"
+ query:
+ format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]"
+ methods: "Query,Search,Delete"
+ connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info
+ connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds
+ maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos
+ gracefulStopTimeout: 30 # seconds. force stop node without graceful stop
+ slowQuerySpanInSeconds: 5 # query whose executed time exceeds the `slowQuerySpanInSeconds` can be considered slow, in seconds.
+ http:
+ enabled: true # Whether to enable the http server
+ debug_mode: false # Whether to enable http server debug mode
+ port: # high-level restful api
+ acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64
+ enablePprof: true # Whether to enable pprof middleware on the metrics port
+ ip: # if not specified, use the first unicastable address
+ port: 19530
+ internalPort: 19529
+ grpc:
+ serverMaxSendSize: 268435456
+ serverMaxRecvSize: 67108864
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 67108864
+
+# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments.
+queryCoord:
+ taskMergeCap: 1
+ taskExecutionCap: 256
+ autoHandoff: true # Enable auto handoff
+ autoBalance: true # Enable auto balance
+ autoBalanceChannel: true # Enable auto balance channel
+ balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes
+ globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes
+ scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance
+ reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance
+ overloadedMemoryThresholdPercentage: 90 # The threshold percentage that memory overload
+ balanceIntervalSeconds: 60
+ memoryUsageMaxDifferencePercentage: 30
+ rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes
+ segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes
+ globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes
+ segmentCountMaxSteps: 50 # segment count based plan generator max steps
+ rowCountMaxSteps: 50 # segment count based plan generator max steps
+ randomMaxSteps: 10 # segment count based plan generator max steps
+ growingRowCountWeight: 4 # the memory weight of growing segment row count
+ balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed
+ checkSegmentInterval: 1000
+ checkChannelInterval: 1000
+ checkBalanceInterval: 10000
+ checkIndexInterval: 10000
+ channelTaskTimeout: 60000 # 1 minute
+ segmentTaskTimeout: 120000 # 2 minute
+ distPullInterval: 500
+ collectionObserverInterval: 200
+ checkExecutedFlagInterval: 100
+ heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available
+ loadTimeoutSeconds: 600
+ distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds
+ heatbeatWarningLag: 5000 # the lag value for querycoord report warning when last heartbeat is too old, in milliseconds
+ checkHandoffInterval: 5000
+ enableActiveStandby: false
+ checkInterval: 1000
+ checkHealthInterval: 3000 # 3s, the interval when query coord try to check health of query node
+ checkHealthRPCTimeout: 2000 # 2000ms, the timeout of check health rpc to query node
+ brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout
+ collectionRecoverTimes: 3 # if collection recover times reach the limit during loading state, release it
+ observerTaskParallel: 16 # the parallel observer dispatcher task number
+ checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
+ checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session
+ gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
+ enableStoppingBalance: true # whether enable stopping balance
+ channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode
+ cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds
+ ip: # if not specified, use the first unicastable address
+ port: 19531
+ grpc:
+ serverMaxSendSize: 536870912
+ serverMaxRecvSize: 268435456
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 536870912
+
+# Related configuration of queryNode, used to run hybrid search between vector and scalar data.
+queryNode:
+ stats:
+ publishInterval: 1000 # Interval for querynode to report node information (milliseconds)
+ segcore:
+ knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]).
+ chunkRows: 128 # The number of vectors in a chunk.
+ interimIndex:
+ enableIndex: true # Enable segment build with index to accelerate vector search when segment is in growing or binlog.
+ nlist: 128 # temp index nlist, recommend to set sqrt(chunkRows), must smaller than chunkRows/8
+ nprobe: 16 # nprobe to search small index, based on your accuracy requirement, must smaller than nlist
+ memExpansionRate: 1.15 # extra memory needed by building interim index
+ buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
+ knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
+ loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments
+ enableDisk: false # enable querynode load disk index, and search on disk index
+ maxDiskUsagePercentage: 95
+ cache:
+ enabled: true
+ memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024
+ readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed`
+ # options: async, sync, disable.
+ # Specifies the necessity for warming up the chunk cache.
+ # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the
+ # chunk cache during the load process. This approach has the potential to substantially reduce query/search latency
+ # for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage;
+ # 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query.
+ warmup: disable
+ mmap:
+ mmapEnabled: false # Enable mmap for loading data
+ lazyload:
+ enabled: false # Enable lazyload for loading data
+ waitTimeout: 30000 # max wait timeout duration in milliseconds before start to do lazyload search and retrieve
+ requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default
+ requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default
+ maxRetryTimes: 1 # max retry times for lazy load, 1 by default
+ maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default
+ grouping:
+ enabled: true
+ maxNQ: 1000
+ topKMergeRatio: 20
+ scheduler:
+ receiveChanSize: 10240
+ unsolvedQueueSize: 10240
+ # maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task).
+ # Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio.
+ # It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2.
+ # Max read concurrency must greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100.
+ # (0, 100]
+ maxReadConcurrentRatio: 1
+ cpuRatio: 10 # ratio used to estimate read task cpu usage.
+ maxTimestampLag: 86400
+ scheduleReadPolicy:
+ # fifo: A FIFO queue support the schedule.
+ # user-task-polling:
+ # The user's tasks will be polled one by one and scheduled.
+ # Scheduling is fair on task granularity.
+ # The policy is based on the username for authentication.
+ # And an empty username is considered the same user.
+ # When there are no multi-users, the policy decay into FIFO"
+ name: fifo
+ taskQueueExpire: 60 # Control how long (many seconds) that queue retains since queue is empty
+ enableCrossUserGrouping: false # Enable Cross user grouping when using user-task-polling policy. (Disable it if user's task can not merge each other)
+ maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler
+ dataSync:
+ flowGraph:
+ maxQueueLength: 16 # Maximum length of task queue in flowgraph
+ maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
+ enableSegmentPrune: false # use partition prune function on shard delegator
+ ip: # if not specified, use the first unicastable address
+ port: 21123
+ grpc:
+ serverMaxSendSize: 536870912
+ serverMaxRecvSize: 268435456
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 536870912
+
+indexCoord:
+ bindIndexNodeMode:
+ enable: false
+ address: localhost:22930
+ withCred: false
+ nodeID: 0
+ segment:
+ minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed
+
+indexNode:
+ scheduler:
+ buildParallel: 1
+ enableDisk: true # enable index node build disk vector index
+ maxDiskUsagePercentage: 95
+ ip: # if not specified, use the first unicastable address
+ port: 21121
+ grpc:
+ serverMaxSendSize: 536870912
+ serverMaxRecvSize: 268435456
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 536870912
+
+dataCoord:
+ channel:
+ watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler update watch progress will reset timeout timer.
+ balanceWithRpc: true # Whether to enable balance with RPC, default to use etcd watch
+ legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered as legacy nodes, which doesn't have rpc based watch(). This is only used during rolling upgrade where legacy nodes won't get new channels
+ balanceSilentDuration: 300 # The duration after which the channel manager start background channel balancing
+ balanceInterval: 360 # The interval with which the channel manager check dml channel balance status
+ checkInterval: 1 # The interval in seconds with which the channel manager advances channel states
+ notifyChannelOperationTimeout: 5 # Timeout notifying channel operations (in seconds).
+ segment:
+ maxSize: 1024 # Maximum size of a segment in MB
+ diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index
+ sealProportion: 0.12
+ assignmentExpiration: 2000 # The time of the assignment expiration in ms
+ allocLatestExpireAttempt: 200 # The time attempting to alloc latest lastExpire from rootCoord after restart
+ maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60
+ # If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than
+ # minSizeFromIdleToSealed, Milvus will automatically seal it.
+ # The max idle time of segment in seconds, 10*60.
+ maxIdleTime: 600
+ minSizeFromIdleToSealed: 16 # The min size in MB of segment which can be idle from sealed.
+ # The max number of binlog file for one segment, the segment will be sealed if
+ # the number of binlog file reaches to max value.
+ maxBinlogFileNumber: 32
+ smallProportion: 0.5 # The segment is considered as "small segment" when its # of rows is smaller than
+ # (smallProportion * segment max # of rows).
+ # A compaction will happen on small segments if the segment after compaction will have
+ compactableProportion: 0.85
+ # over (compactableProportion * segment max # of rows) rows.
+ # MUST BE GREATER THAN OR EQUAL TO !!!
+ # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%.
+ expansionRate: 1.25
+ autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version
+ enableCompaction: true # Enable data segment compaction
+ compaction:
+ enableAutoCompaction: true
+ indexBasedCompaction: true
+ rpcTimeout: 10
+ maxParallelTaskNum: 10
+ workerMaxParallelTaskNum: 2
+ levelzero:
+ forceTrigger:
+ minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB
+ maxSize: 67108864 # The maximum size in bytes to force trigger a LevelZero Compaction, default as 64MB
+ deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction
+ deltalogMaxNum: 30 # The maximum number of deltalog files to force trigger a LevelZero Compaction, default as 30
+ enableGarbageCollection: true
+ gc:
+ interval: 3600 # gc interval in seconds
+ missingTolerance: 86400 # file meta missing tolerance duration in seconds, default to 24hr(1d)
+ dropTolerance: 10800 # file belongs to dropped entity tolerance duration in seconds. 3600
+ removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects
+ scanInterval: 168 # garbage collection scan residue interval in hours
+ enableActiveStandby: false
+ brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout
+ autoBalance: true # Enable auto balance
+ checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
+ import:
+ filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task.
+ taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state.
+ maxSizeInMBPerImportTask: 6144 # To prevent generating of small segments, we will re-group imported files. This parameter represents the sum of file sizes in each group (each ImportTask).
+ scheduleInterval: 2 # The interval for scheduling import, measured in seconds.
+ checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker.
+ checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker.
+ maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request.
+ waitForIndex: true # Indicates whether the import operation waits for the completion of index building.
+ gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
+ ip: # if not specified, use the first unicastable address
+ port: 13333
+ grpc:
+ serverMaxSendSize: 536870912
+ serverMaxRecvSize: 268435456
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 536870912
+
+dataNode:
+ dataSync:
+ flowGraph:
+ maxQueueLength: 16 # Maximum length of task queue in flowgraph
+ maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
+ maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally
+ skipMode:
+ enable: true # Support skip some timetick message to reduce CPU usage
+ skipNum: 4 # Consume one for every n records skipped
+ coldTime: 60 # Turn on skip mode after there are only timetick msg for x seconds
+ segment:
+ insertBufSize: 16777216 # Max buffer size to flush for a single segment.
+ deleteBufBytes: 16777216 # Max buffer size in bytes to flush del for a single channel, default as 16MB
+ syncPeriod: 600 # The period to sync segments if buffer is not empty.
+ memory:
+ forceSyncEnable: true # Set true to force sync if memory usage is too high
+ forceSyncSegmentNum: 1 # number of segments to sync, segments with top largest buffer will be synced.
+ checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds
+ forceSyncWatermark: 0.5 # memory watermark for standalone, upon reaching this watermark, segments will be synced.
+ timetick:
+ byRPC: true
+ interval: 500
+ channel:
+ # specify the size of global work pool of all channels
+ # if this parameter <= 0, will set it as the maximum number of CPUs that can be executing
+ # suggest to set it bigger on large collection numbers to avoid blocking
+ workPoolSize: -1
+ # specify the size of global work pool for channel checkpoint updating
+ # if this parameter <= 0, will set it as 10
+ updateChannelCheckpointMaxParallel: 10
+ updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel
+ updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call
+ maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC.
+ channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates.
+ import:
+ maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode.
+ maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files.
+ readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import.
+ compaction:
+ levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode
+ gracefulStopTimeout: 1800 # seconds. force stop node without graceful stop
+ ip: # if not specified, use the first unicastable address
+ port: 21124
+ grpc:
+ serverMaxSendSize: 536870912
+ serverMaxRecvSize: 268435456
+ clientMaxSendSize: 268435456
+ clientMaxRecvSize: 536870912
+
+# Configures the system log output.
+log:
+ level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
+ file:
+ rootPath: # root dir path to put logs, default "" means no log file will print. please adjust in embedded Milvus: /tmp/milvus/logs
+ maxSize: 300 # MB
+ maxAge: 10 # Maximum time for log retention in day.
+ maxBackups: 20
+ format: text # text or json
+ stdout: true # Stdout enable or not
+
+grpc:
+ log:
+ level: WARNING
+ gracefulStopTimeout: 10 # second, time to wait graceful stop finish
+ client:
+ compressionEnabled: false
+ dialTimeout: 200
+ keepAliveTime: 10000
+ keepAliveTimeout: 20000
+ maxMaxAttempts: 10
+ initialBackoff: 0.2
+ maxBackoff: 10
+ minResetInterval: 1000
+ maxCancelError: 32
+ minSessionCheckInterval: 200
+
+# Configure the proxy tls enable.
+tls:
+ serverPemPath: configs/cert/server.pem
+ serverKeyPath: configs/cert/server.key
+ caPemPath: configs/cert/ca.pem
+
+common:
+ defaultPartitionName: _default # default partition name for a collection
+ defaultIndexName: _default_idx # default index name
+ entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire
+ indexSliceSize: 16 # MB
+ threadCoreCoefficient:
+ highPriority: 10 # This parameter specify how many times the number of threads is the number of cores in high priority pool
+ middlePriority: 5 # This parameter specify how many times the number of threads is the number of cores in middle priority pool
+ lowPriority: 1 # This parameter specify how many times the number of threads is the number of cores in low priority pool
+ buildIndexThreadPoolRatio: 0.75
+ DiskIndex:
+ MaxDegree: 56
+ SearchListSize: 100
+ PQCodeBudgetGBRatio: 0.125
+ BuildNumThreadsRatio: 1
+ SearchCacheBudgetGBRatio: 0.1
+ LoadNumThreadRatio: 8
+ BeamWidthRatio: 4
+ gracefulTime: 5000 # milliseconds. it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency.
+ gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time.
+ storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead
+ # Default value: auto
+ # Valid values: [auto, avx512, avx2, avx, sse4_2]
+ # This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building.
+ simdType: auto
+ security:
+ authorizationEnabled: false
+ # The superusers will ignore some system check processes,
+ # like the old password verification when updating the credential
+ superUsers:
+ tlsMode: 0
+ session:
+ ttl: 30 # ttl value when session granting a lease to register service
+ retryTimes: 30 # retry times when session sending etcd requests
+ locks:
+ metrics:
+ enable: false # whether gather statistics for metrics locks
+ threshold:
+ info: 500 # minimum milliseconds for printing durations in info level
+ warn: 1000 # minimum milliseconds for printing durations in warn level
+ storage:
+ scheme: s3
+ enablev2: false
+ ttMsgEnabled: true # Whether the instance disable sending ts messages
+ traceLogMode: 0 # trace request info
+ bloomFilterSize: 100000 # bloom filter initial size
+ maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter
+
+# QuotaConfig, configurations of Milvus quota and limits.
+# By default, we enable:
+# 1. TT protection;
+# 2. Memory protection.
+# 3. Disk quota protection.
+# You can enable:
+# 1. DML throughput limitation;
+# 2. DDL, DQL qps/rps limitation;
+# 3. DQL Queue length/latency protection;
+# 4. DQL result rate protection;
+# If necessary, you can also manually force to deny RW requests.
+quotaAndLimits:
+ enabled: true # `true` to enable quota and limits, `false` to disable.
+ # quotaCenterCollectInterval is the time interval that quotaCenter
+ # collects metrics from Proxies, Query cluster and Data cluster.
+ # seconds, (0 ~ 65536)
+ quotaCenterCollectInterval: 3
+ ddl:
+ enabled: false
+ collectionRate: -1 # qps, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
+ partitionRate: -1 # qps, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
+ db:
+ collectionRate: -1 # qps of db level , default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
+ partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
+ indexRate:
+ enabled: false
+ max: -1 # qps, default no limit, rate for CreateIndex, DropIndex
+ db:
+ max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex
+ flushRate:
+ enabled: true
+ max: -1 # qps, default no limit, rate for flush
+ collection:
+ max: 0.1 # qps, default no limit, rate for flush at collection level.
+ db:
+ max: -1 # qps of db level, default no limit, rate for flush
+ compactionRate:
+ enabled: false
+ max: -1 # qps, default no limit, rate for manualCompaction
+ db:
+ max: -1 # qps of db level, default no limit, rate for manualCompaction
+ dml:
+ # dml limit rates, default no limit.
+ # The maximum rate will not be greater than max.
+ enabled: false
+ insertRate:
+ max: -1 # MB/s, default no limit
+ db:
+ max: -1 # MB/s, default no limit
+ collection:
+ max: -1 # MB/s, default no limit
+ partition:
+ max: -1 # MB/s, default no limit
+ upsertRate:
+ max: -1 # MB/s, default no limit
+ db:
+ max: -1 # MB/s, default no limit
+ collection:
+ max: -1 # MB/s, default no limit
+ partition:
+ max: -1 # MB/s, default no limit
+ deleteRate:
+ max: -1 # MB/s, default no limit
+ db:
+ max: -1 # MB/s, default no limit
+ collection:
+ max: -1 # MB/s, default no limit
+ partition:
+ max: -1 # MB/s, default no limit
+ bulkLoadRate:
+ max: -1 # MB/s, default no limit, not support yet. TODO: limit bulkLoad rate
+ db:
+ max: -1 # MB/s, default no limit, not support yet. TODO: limit db bulkLoad rate
+ collection:
+ max: -1 # MB/s, default no limit, not support yet. TODO: limit collection bulkLoad rate
+ partition:
+ max: -1 # MB/s, default no limit, not support yet. TODO: limit partition bulkLoad rate
+ dql:
+ # dql limit rates, default no limit.
+ # The maximum rate will not be greater than max.
+ enabled: false
+ searchRate:
+ max: -1 # vps (vectors per second), default no limit
+ db:
+ max: -1 # vps (vectors per second), default no limit
+ collection:
+ max: -1 # vps (vectors per second), default no limit
+ partition:
+ max: -1 # vps (vectors per second), default no limit
+ queryRate:
+ max: -1 # qps, default no limit
+ db:
+ max: -1 # qps, default no limit
+ collection:
+ max: -1 # qps, default no limit
+ partition:
+ max: -1 # qps, default no limit
+ limits:
+ maxCollectionNum: 65536
+ maxCollectionNumPerDB: 65536
+ maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit
+ maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes
+ limitWriting:
+ # forceDeny false means dml requests are allowed (except for some
+ # specific conditions, such as memory of nodes to water marker), true means always reject all dml requests.
+ forceDeny: false
+ ttProtection:
+ enabled: false
+ # maxTimeTickDelay indicates the backpressure for DML Operations.
+ # DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay,
+ # if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected.
+ # seconds
+ maxTimeTickDelay: 300
+ memProtection:
+ # When memory usage > memoryHighWaterLevel, all dml requests would be rejected;
+ # When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate;
+ # When memory usage < memoryLowWaterLevel, no action.
+ enabled: true
+ dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes
+ dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes
+ queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes
+ queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes
+ growingSegmentsSizeProtection:
+ # No action will be taken if the growing segments size is less than the low watermark.
+ # When the growing segments size exceeds the low watermark, the dml rate will be reduced,
+ # but the rate will not be lower than minRateRatio * dmlRate.
+ enabled: false
+ minRateRatio: 0.5
+ lowWaterLevel: 0.2
+ highWaterLevel: 0.4
+ diskProtection:
+ enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected;
+ diskQuota: -1 # MB, (0, +inf), default no limit
+ diskQuotaPerDB: -1 # MB, (0, +inf), default no limit
+ diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit
+ diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit
+ limitReading:
+ # forceDeny false means dql requests are allowed (except for some
+ # specific conditions, such as collection has been dropped), true means always reject all dql requests.
+ forceDeny: false
+ queueProtection:
+ enabled: false
+ # nqInQueueThreshold indicated that the system was under backpressure for Search/Query path.
+ # If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off
+ # until the NQ in queue no longer exceeds nqInQueueThreshold. We think of the NQ of query request as 1.
+ # int, default no limit
+ nqInQueueThreshold: -1
+ # queueLatencyThreshold indicated that the system was under backpressure for Search/Query path.
+ # If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off
+ # until the latency of queuing no longer exceeds queueLatencyThreshold.
+ # The latency here refers to the averaged latency over a period of time.
+ # milliseconds, default no limit
+ queueLatencyThreshold: -1
+ resultProtection:
+ enabled: false
+ # maxReadResultRate indicated that the system was under backpressure for Search/Query path.
+ # If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off
+ # until the read result rate no longer exceeds maxReadResultRate.
+ # MB/s, default no limit
+ maxReadResultRate: -1
+ maxReadResultRatePerDB: -1
+ maxReadResultRatePerCollection: -1
+ # colOffSpeed is the speed of search&query rates cool off.
+ # (0, 1]
+ coolOffSpeed: 0.9
+
+trace:
+ # trace exporter type, default is stdout,
+ # optional values: ['noop','stdout', 'jaeger', 'otlp']
+ exporter: noop
+ # fraction of traceID based sampler,
+ # optional values: [0, 1]
+ # Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
+ sampleFraction: 0
+ jaeger:
+ url: # when exporter is jaeger should set the jaeger's URL
+ otlp:
+ endpoint: # example: "127.0.0.1:4318"
+ secure: true
+
+#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation.
+#here, you can set the size of the memory occupied by the memory pool, with the unit being MB.
+#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize.
+#if initMemSize and MaxMemSize both set zero,
+#milvus will automatically initialize half of the available GPU memory,
+#maxMemSize will the whole available GPU memory.
+gpu:
+ initMemSize: # Gpu Memory Pool init size
+ maxMemSize: # Gpu Memory Pool Max size
diff --git a/ChatQnA/tests/test_compose_milvus_on_xeon.sh b/ChatQnA/tests/test_compose_milvus_on_xeon.sh
new file mode 100644
index 0000000000..d2953a9992
--- /dev/null
+++ b/ChatQnA/tests/test_compose_milvus_on_xeon.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -e
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export host_ip=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ git clone https://github.com/vllm-project/vllm.git && cd vllm
+ VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null
+ # make sure NOT change the pwd
+ cd ../
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+
+ docker images && sleep 1s
+}
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ export no_proxy=${no_proxy},${ip_address}
+ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+ export RERANK_MODEL_ID="BAAI/bge-reranker-base"
+ export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export LOGFLAG=true
+
+ # Start Docker Containers
+ docker compose -f compose_milvus.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+ n=0
+ until [[ "$n" -ge 100 ]]; do
+ docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
+ if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
+ break
+ fi
+ sleep 5s
+ n=$((n+1))
+ done
+}
+
+function validate_service() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
+ cd $LOG_PATH
+ HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
+ elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
+ HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
+ else
+ HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+ fi
+ HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+ RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+
+
+ # check response status
+ if [ "$HTTP_STATUS" -ne "200" ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ exit 1
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+ fi
+ echo "Response"
+ echo $RESPONSE_BODY
+ echo "Expected Result"
+ echo $EXPECTED_RESULT
+ # check response body
+ if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+ exit 1
+ else
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ fi
+
+ sleep 1s
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # tei for embedding service
+ validate_service \
+ "${ip_address}:6006/embed" \
+ "[[" \
+ "tei-embedding" \
+ "tei-embedding-server" \
+ '{"inputs":"What is Deep Learning?"}'
+
+ sleep 1m # retrieval can't curl as expected, try to wait for more time
+
+ # test /v1/dataprep/ingest upload file
+ echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
+ validate_service \
+ "http://${ip_address}:11101/v1/dataprep/ingest" \
+ "Data preparation succeeded" \
+ "dataprep_upload_file" \
+ "dataprep-milvus-server"
+
+ # test /v1/dataprep/delete
+ validate_service \
+ "http://${ip_address}:11101/v1/dataprep/delete" \
+ '{"status":true}' \
+ "dataprep_del" \
+ "dataprep-milvus-server"
+
+    # test /v1/dataprep/delete again to verify repeated deletion is handled gracefully
+ validate_service \
+ "http://${ip_address}:11101/v1/dataprep/delete" \
+ '{"status":true}' \
+ "dataprep_del" \
+ "dataprep-milvus-server"
+
+
+ # retrieval microservice
+ test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+ validate_service \
+ "${ip_address}:7000/v1/retrieval" \
+ " " \
+ "retrieval" \
+ "retriever-milvus-server" \
+ "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"
+
+ # tei for rerank microservice
+ echo "Validating reranking service"
+ validate_service \
+ "${ip_address}:8808/rerank" \
+ '{"index":1,"score":' \
+ "tei-rerank" \
+ "tei-reranking-server" \
+ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'
+
+
+    # vllm for llm service
+ echo "Validating llm service"
+ validate_service \
+ "${ip_address}:9009/v1/chat/completions" \
+ "content" \
+ "vllm-llm" \
+ "vllm-service" \
+ '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+}
+
+function validate_megaservice() {
+ # Curl the Mega Service
+ validate_service \
+ "${ip_address}:8888/v1/chatqna" \
+ "data: " \
+ "chatqna-megaservice" \
+ "chatqna-xeon-backend-server" \
+ '{"messages": "What is the revenue of Nike in 2023?"}'
+
+}
+
+function validate_frontend() {
+ echo "[ TEST INFO ]: --------- frontend test started ---------"
+ cd $WORKPATH/ui/svelte
+ local conda_env_name="OPEA_e2e"
+ export PATH=${HOME}/miniforge3/bin/:$PATH
+ if conda info --envs | grep -q "$conda_env_name"; then
+ echo "$conda_env_name exist!"
+ else
+ conda create -n ${conda_env_name} python=3.12 -y
+ fi
+ source activate ${conda_env_name}
+ echo "[ TEST INFO ]: --------- conda env activated ---------"
+
+ sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+ conda install -c conda-forge nodejs=22.6.0 -y
+ npm install && npm ci && npx playwright install --with-deps
+ node -v && npm -v && pip list
+
+ exit_status=0
+ npx playwright test || exit_status=$?
+
+ if [ $exit_status -ne 0 ]; then
+ echo "[TEST INFO]: ---------frontend test failed---------"
+ exit $exit_status
+ else
+ echo "[TEST INFO]: ---------frontend test passed---------"
+ fi
+}
+
+function stop_docker() {
+ echo "In stop docker"
+ echo $WORKPATH
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ docker compose -f compose_milvus.yaml down
+}
+
+function main() {
+
+ stop_docker
+
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+
+ start_time=$(date +%s)
+ start_services
+ end_time=$(date +%s)
+ duration=$((end_time-start_time))
+ echo "Mega service start duration is $duration s" && sleep 1s
+
+ validate_microservices
+ echo "==== microservices validated ===="
+ validate_megaservice
+ echo "==== megaservice validated ===="
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
From 3460a380b62485303d2f4e46e44127d7305553f4 Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Mon, 3 Mar 2025 08:45:10 +0800
Subject: [PATCH 031/226] Fix cd workflow condition (#1588)
Fix cd workflow condition
Signed-off-by: chensuyue
Co-authored-by: ZePan110
Signed-off-by: Chingis Yundunov
---
.github/workflows/manual-example-workflow.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/manual-example-workflow.yml b/.github/workflows/manual-example-workflow.yml
index 9e119dcf7a..6fde18ddce 100644
--- a/.github/workflows/manual-example-workflow.yml
+++ b/.github/workflows/manual-example-workflow.yml
@@ -76,7 +76,7 @@ jobs:
build-deploy-gmc:
needs: [get-test-matrix]
- if: ${{ fromJSON(inputs.deploy_gmc) }} && ${{ fromJSON(needs.get-test-matrix.outputs.nodes).length != 0 }}
+ if: ${{ fromJSON(inputs.deploy_gmc) }}
strategy:
matrix:
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
@@ -90,7 +90,7 @@ jobs:
run-examples:
needs: [get-test-matrix, build-deploy-gmc]
- if: always() && ${{ fromJSON(needs.get-test-matrix.outputs.examples).length != 0 }}
+ if: always()
strategy:
matrix:
example: ${{ fromJson(needs.get-test-matrix.outputs.examples) }}
From fc75a8c7a18062706dec7e907bdb22fa461fdffa Mon Sep 17 00:00:00 2001
From: Ying Hu
Date: Mon, 3 Mar 2025 16:17:19 +0800
Subject: [PATCH 032/226] Update DBQnA tgi docker image to latest tgi 2.4.0
(#1593)
Signed-off-by: Chingis Yundunov
---
DBQnA/docker_compose/intel/cpu/xeon/README.md | 37 ++++++++++++-------
.../intel/cpu/xeon/compose.yaml | 4 +-
.../docker_compose/intel/cpu/xeon/set_env.sh | 27 ++++++++++++++
DBQnA/tests/test_compose_on_xeon.sh | 3 ++
4 files changed, 55 insertions(+), 16 deletions(-)
create mode 100644 DBQnA/docker_compose/intel/cpu/xeon/set_env.sh
diff --git a/DBQnA/docker_compose/intel/cpu/xeon/README.md b/DBQnA/docker_compose/intel/cpu/xeon/README.md
index 78d5b60419..26b46ec4b9 100644
--- a/DBQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/DBQnA/docker_compose/intel/cpu/xeon/README.md
@@ -38,25 +38,29 @@ We set default model as "mistralai/Mistral-7B-Instruct-v0.3", change "LLM_MODEL_
If use gated models, you also need to provide [huggingface token](https://huggingface.co/docs/hub/security-tokens) to "HUGGINGFACEHUB_API_TOKEN" environment variable.
+```bash
+export HUGGINGFACEHUB_API_TOKEN="xxx"
+```
+
### 2.1 Setup Environment Variables
Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
```bash
-# your_ip should be your external IP address, do not use localhost.
-export your_ip=$(hostname -I | awk '{print $1}')
+# host_ip should be your external IP address, do not use localhost.
+export host_ip=$(hostname -I | awk '{print $1}')
# Example: no_proxy="localhost,127.0.0.1,192.168.1.1"
-export no_proxy=${your_no_proxy},${your_ip}
+export no_proxy=${no_proxy},${host_ip}
# If you are in a proxy environment, also set the proxy-related environment variables:
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
+export http_proxy=${http_proxy}
+export https_proxy=${https_proxy}
# Set other required variables
export TGI_PORT=8008
-export TGI_LLM_ENDPOINT=http://${your_ip}:${TGI_PORT}
+export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_PORT}
export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export POSTGRES_USER=postgres
@@ -65,7 +69,14 @@ export POSTGRES_DB=chinook
export text2sql_port=9090
```
-Note: Please replace with `your_ip` with your external IP address, do not use localhost.
+or
+edit the file set_env.sh to set those environment variables,
+
+```bash
+source set_env.sh
+```
+
+Note: Please replace with `host_ip` with your external IP address, do not use localhost.
### 2.2 Start Microservice Docker Containers
@@ -120,7 +131,7 @@ docker run -d --name="test-dbqna-react-ui-server" --ipc=host -p 5174:80 -e no_pr
```bash
-curl http://${your_ip}:$TGI_PORT/generate \
+curl http://${host_ip}:$TGI_PORT/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
@@ -133,17 +144,17 @@ Once Text-to-SQL microservice is started, user can use below command
#### 3.2.1 Test the Database connection
```bash
-curl --location http://${your_ip}:9090/v1/postgres/health \
+curl --location http://${host_ip}:9090/v1/postgres/health \
--header 'Content-Type: application/json' \
- --data '{"user": "'${POSTGRES_USER}'","password": "'${POSTGRES_PASSWORD}'","host": "'${your_ip}'", "port": "5442", "database": "'${POSTGRES_DB}'"}'
+ --data '{"user": "'${POSTGRES_USER}'","password": "'${POSTGRES_PASSWORD}'","host": "'${host_ip}'", "port": "5442", "database": "'${POSTGRES_DB}'"}'
```
#### 3.2.2 Invoke the microservice.
```bash
-curl http://${your_ip}:9090/v1/text2sql\
+curl http://${host_ip}:9090/v1/text2sql\
-X POST \
- -d '{"input_text": "Find the total number of Albums.","conn_str": {"user": "'${POSTGRES_USER}'","password": "'${POSTGRES_PASSWORD}'","host": "'${your_ip}'", "port": "5442", "database": "'${POSTGRES_DB}'"}}' \
+ -d '{"input_text": "Find the total number of Albums.","conn_str": {"user": "'${POSTGRES_USER}'","password": "'${POSTGRES_PASSWORD}'","host": "'${host_ip}'", "port": "5442", "database": "'${POSTGRES_DB}'"}}' \
-H 'Content-Type: application/json'
```
@@ -161,7 +172,7 @@ npm run test
## 🚀 Launch the React UI
-Open this URL `http://{your_ip}:5174` in your browser to access the frontend.
+Open this URL `http://{host_ip}:5174` in your browser to access the frontend.

diff --git a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 9b2bcbfbaa..6654bc535d 100644
--- a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -1,11 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-version: "3.8"
-
services:
tgi-service:
- image: ghcr.io/huggingface/text-generation-inference:2.1.0
+ image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
ports:
- "8008:80"
diff --git a/DBQnA/docker_compose/intel/cpu/xeon/set_env.sh b/DBQnA/docker_compose/intel/cpu/xeon/set_env.sh
new file mode 100644
index 0000000000..beae6d5bc9
--- /dev/null
+++ b/DBQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+pushd "../../../../../" > /dev/null
+source .set_env.sh
+popd > /dev/null
+
+#export host_ip=$(hostname -I | awk '{print $1}')
+
+if [ -z "${HUGGINGFACEHUB_API_TOKEN}" ]; then
+ echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set HUGGINGFACEHUB_API_TOKEN."
+fi
+
+if [ -z "${host_ip}" ]; then
+ echo "Error: host_ip is not set. Please set host_ip first."
+fi
+export no_proxy=$no_proxy,$host_ip,dbqna-xeon-react-ui-server,text2sql-service,tgi-service,postgres-container
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export TGI_PORT=8008
+export TGI_LLM_ENDPOINT="http://${host_ip}:${TGI_PORT}"
+export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
+export POSTGRES_USER=postgres
+export POSTGRES_PASSWORD=testpwd
+export POSTGRES_DB=chinook
+export TEXT2SQL_PORT=9090
+
diff --git a/DBQnA/tests/test_compose_on_xeon.sh b/DBQnA/tests/test_compose_on_xeon.sh
index e9a50cf0e7..8775fc79dc 100755
--- a/DBQnA/tests/test_compose_on_xeon.sh
+++ b/DBQnA/tests/test_compose_on_xeon.sh
@@ -22,6 +22,9 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+ docker images && sleep 1s
}
function start_service() {
From 2b701cad4f0e96904bc4dc57a807109a9a7b197b Mon Sep 17 00:00:00 2001
From: Spycsh <39623753+Spycsh@users.noreply.github.com>
Date: Mon, 3 Mar 2025 23:03:44 +0800
Subject: [PATCH 033/226] Revert chatqna async and enhance tests (#1598)
align with opea-project/GenAIComps#1354
Signed-off-by: Chingis Yundunov
---
ChatQnA/chatqna.py | 4 ++--
ChatQnA/tests/test_compose_on_gaudi.sh | 2 +-
ChatQnA/tests/test_compose_on_rocm.sh | 6 +++---
ChatQnA/tests/test_compose_on_xeon.sh | 3 ++-
4 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index e25ab4d39a..afb9706cb2 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -166,10 +166,10 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
return next_data
-async def align_generator(self, gen, **kwargs):
+def align_generator(self, gen, **kwargs):
# OpenAI response format
# b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
- async for line in gen:
+ for line in gen:
line = line.decode("utf-8")
start = line.find("{")
end = line.rfind("}") + 1
diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh
index 2785995bbb..e1a37707e1 100644
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -137,7 +137,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data:" \
+ "Nike" \
"mega-chatqna" \
"chatqna-gaudi-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh
index d6dc5dfae1..ebfd9562a4 100644
--- a/ChatQnA/tests/test_compose_on_rocm.sh
+++ b/ChatQnA/tests/test_compose_on_rocm.sh
@@ -207,7 +207,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"chatqna-megaservice" \
"chatqna-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
@@ -263,8 +263,8 @@ function main() {
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
- validate_frontend
- echo "==== frontend validated ===="
+ # validate_frontend
+ # echo "==== frontend validated ===="
stop_docker
echo y | docker system prune
diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh
index 69df81a0e8..a61fbf11bb 100644
--- a/ChatQnA/tests/test_compose_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_on_xeon.sh
@@ -101,6 +101,7 @@ function validate_service() {
function validate_microservices() {
# Check if the microservices are running correctly.
+ sleep 10m
# tei for embedding service
validate_service \
@@ -142,7 +143,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data" \
+ "Nike" \
"mega-chatqna" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
From 12845f1f751e6b1b82daee90e5097d870b8a03c5 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Tue, 4 Mar 2025 09:48:27 +0800
Subject: [PATCH 034/226] Use model cache for docker compose test (#1582)
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
.github/workflows/_example-workflow.yml | 9 +++--
.github/workflows/_run-docker-compose.yml | 6 ++++
.github/workflows/manual-example-workflow.yml | 35 +++++++++++--------
.github/workflows/pr-docker-compose-e2e.yml | 1 +
.../intel/cpu/xeon/compose.yaml | 2 +-
.../intel/cpu/xeon/compose_multilang.yaml | 2 +-
.../intel/hpu/gaudi/compose.yaml | 2 +-
AudioQnA/tests/test_compose_on_gaudi.sh | 1 +
AudioQnA/tests/test_compose_on_xeon.sh | 1 +
.../intel/cpu/xeon/compose.yaml | 2 +-
DBQnA/tests/test_compose_on_xeon.sh | 1 +
.../intel/cpu/xeon/compose.yaml | 4 +--
.../cpu/xeon/compose_without_rerank.yaml | 2 +-
.../intel/hpu/gaudi/compose.yaml | 4 +--
.../tests/test_compose_on_gaudi.sh | 1 +
.../tests/test_compose_on_xeon.sh | 1 +
.../test_compose_without_rerank_on_xeon.sh | 1 +
.../intel/cpu/xeon/compose.yaml | 2 +-
FaqGen/tests/test_compose_on_gaudi.sh | 2 +-
FaqGen/tests/test_compose_on_xeon.sh | 1 +
.../intel/cpu/xeon/compose.yaml | 2 +-
.../intel/hpu/gaudi/compose.yaml | 2 +-
Translation/tests/test_compose_on_gaudi.sh | 1 +
Translation/tests/test_compose_on_xeon.sh | 1 +
.../intel/cpu/xeon/compose.yaml | 2 +-
.../intel/hpu/gaudi/compose.yaml | 2 +-
VisualQnA/tests/test_compose_on_gaudi.sh | 1 +
VisualQnA/tests/test_compose_on_xeon.sh | 1 +
28 files changed, 61 insertions(+), 31 deletions(-)
diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml
index d79c4132e1..010eece64a 100644
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -43,7 +43,11 @@ on:
inject_commit:
default: false
required: false
- type: string
+ type: boolean
+ use_model_cache:
+ default: false
+ required: false
+ type: boolean
jobs:
####################################################################################################
@@ -110,6 +114,7 @@ jobs:
tag: ${{ inputs.tag }}
example: ${{ inputs.example }}
hardware: ${{ inputs.node }}
+ use_model_cache: ${{ inputs.use_model_cache }}
secrets: inherit
@@ -131,7 +136,7 @@ jobs:
####################################################################################################
test-gmc-pipeline:
needs: [build-images]
- if: ${{ fromJSON(inputs.test_gmc) }}
+ if: false # ${{ fromJSON(inputs.test_gmc) }}
uses: ./.github/workflows/_gmc-e2e.yml
with:
example: ${{ inputs.example }}
diff --git a/.github/workflows/_run-docker-compose.yml b/.github/workflows/_run-docker-compose.yml
index 54ec72eea3..3d02b7b4ae 100644
--- a/.github/workflows/_run-docker-compose.yml
+++ b/.github/workflows/_run-docker-compose.yml
@@ -28,6 +28,10 @@ on:
required: false
type: string
default: ""
+ use_model_cache:
+ required: false
+ type: boolean
+ default: false
jobs:
get-test-case:
runs-on: ubuntu-latest
@@ -144,9 +148,11 @@ jobs:
example: ${{ inputs.example }}
hardware: ${{ inputs.hardware }}
test_case: ${{ matrix.test_case }}
+ use_model_cache: ${{ inputs.use_model_cache }}
run: |
cd ${{ github.workspace }}/$example/tests
if [[ "$IMAGE_REPO" == "" ]]; then export IMAGE_REPO="${OPEA_IMAGE_REPO}opea"; fi
+ if [[ "$use_model_cache" == "true" ]]; then export model_cache="/data2/hf_model"; fi
if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi
- name: Clean up container after test
diff --git a/.github/workflows/manual-example-workflow.yml b/.github/workflows/manual-example-workflow.yml
index 6fde18ddce..3a98b3d40e 100644
--- a/.github/workflows/manual-example-workflow.yml
+++ b/.github/workflows/manual-example-workflow.yml
@@ -20,11 +20,11 @@ on:
description: "Tag to apply to images"
required: true
type: string
- deploy_gmc:
- default: false
- description: 'Whether to deploy gmc'
- required: true
- type: boolean
+ # deploy_gmc:
+ # default: false
+ # description: 'Whether to deploy gmc'
+ # required: true
+ # type: boolean
build:
default: true
description: 'Build test required images for Examples'
@@ -40,11 +40,11 @@ on:
description: 'Test examples with helm charts'
required: false
type: boolean
- test_gmc:
- default: false
- description: 'Test examples with gmc'
- required: false
- type: boolean
+ # test_gmc:
+ # default: false
+ # description: 'Test examples with gmc'
+ # required: false
+ # type: boolean
opea_branch:
default: "main"
description: 'OPEA branch for image build'
@@ -54,7 +54,12 @@ on:
default: false
description: "inject commit to docker images true or false"
required: false
- type: string
+ type: boolean
+ use_model_cache:
+ default: false
+ description: "use model cache true or false"
+ required: false
+ type: boolean
permissions: read-all
jobs:
@@ -76,7 +81,8 @@ jobs:
build-deploy-gmc:
needs: [get-test-matrix]
- if: ${{ fromJSON(inputs.deploy_gmc) }}
+ if: false
+ #${{ fromJSON(inputs.deploy_gmc) }}
strategy:
matrix:
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
@@ -89,7 +95,7 @@ jobs:
secrets: inherit
run-examples:
- needs: [get-test-matrix, build-deploy-gmc]
+ needs: [get-test-matrix] #[get-test-matrix, build-deploy-gmc]
if: always()
strategy:
matrix:
@@ -104,7 +110,8 @@ jobs:
build: ${{ fromJSON(inputs.build) }}
test_compose: ${{ fromJSON(inputs.test_compose) }}
test_helmchart: ${{ fromJSON(inputs.test_helmchart) }}
- test_gmc: ${{ fromJSON(inputs.test_gmc) }}
+ # test_gmc: ${{ fromJSON(inputs.test_gmc) }}
opea_branch: ${{ inputs.opea_branch }}
inject_commit: ${{ inputs.inject_commit }}
+ use_model_cache: ${{ inputs.use_model_cache }}
secrets: inherit
diff --git a/.github/workflows/pr-docker-compose-e2e.yml b/.github/workflows/pr-docker-compose-e2e.yml
index c924f0e26a..a7604f29af 100644
--- a/.github/workflows/pr-docker-compose-e2e.yml
+++ b/.github/workflows/pr-docker-compose-e2e.yml
@@ -42,5 +42,6 @@ jobs:
tag: "ci"
example: ${{ matrix.example }}
hardware: ${{ matrix.hardware }}
+ use_model_cache: true
diff_excluded_files: '\.github|\.md|\.txt|kubernetes|gmc|assets|benchmark'
secrets: inherit
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 78a17dda04..48756c00b6 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -30,7 +30,7 @@ services:
ports:
- "3006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
index d83e1002c0..c6ad650943 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "3006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 2624dbf531..45691f478b 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -40,7 +40,7 @@ services:
ports:
- "3006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/AudioQnA/tests/test_compose_on_gaudi.sh b/AudioQnA/tests/test_compose_on_gaudi.sh
index 2eb0bf3408..fe5cff379a 100644
--- a/AudioQnA/tests/test_compose_on_gaudi.sh
+++ b/AudioQnA/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/AudioQnA/tests/test_compose_on_xeon.sh b/AudioQnA/tests/test_compose_on_xeon.sh
index 48047948cc..11a86ba5c8 100644
--- a/AudioQnA/tests/test_compose_on_xeon.sh
+++ b/AudioQnA/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 6654bc535d..8e4c15bd6b 100644
--- a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/DBQnA/tests/test_compose_on_xeon.sh b/DBQnA/tests/test_compose_on_xeon.sh
index 8775fc79dc..da9fa1b71a 100755
--- a/DBQnA/tests/test_compose_on_xeon.sh
+++ b/DBQnA/tests/test_compose_on_xeon.sh
@@ -10,6 +10,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
index 9624df7300..6ecebfdc23 100644
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
@@ -38,7 +38,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -96,7 +96,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
index 68afbf18e7..edc563cdbe 100644
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -36,7 +36,7 @@ services:
ports:
- "6006:80"
volumes:
- - "/mnt/models:/data"
+ - "${MODEL_CACHE:-/mnt/models}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
index eedbe66719..f47d01a7cf 100644
--- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -34,7 +34,7 @@ services:
ports:
- "8090:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -95,7 +95,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/DocIndexRetriever/tests/test_compose_on_gaudi.sh b/DocIndexRetriever/tests/test_compose_on_gaudi.sh
index 2176caf638..d6dd8a7138 100644
--- a/DocIndexRetriever/tests/test_compose_on_gaudi.sh
+++ b/DocIndexRetriever/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/DocIndexRetriever/tests/test_compose_on_xeon.sh b/DocIndexRetriever/tests/test_compose_on_xeon.sh
index 1e490a517d..0027ebe9a3 100644
--- a/DocIndexRetriever/tests/test_compose_on_xeon.sh
+++ b/DocIndexRetriever/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh b/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh
index ddd62ebd8a..16aed41242 100644
--- a/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh
+++ b/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
index a20c784786..ea24486cda 100644
--- a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/FaqGen/tests/test_compose_on_gaudi.sh b/FaqGen/tests/test_compose_on_gaudi.sh
index 8726c0f027..eeba304279 100644
--- a/FaqGen/tests/test_compose_on_gaudi.sh
+++ b/FaqGen/tests/test_compose_on_gaudi.sh
@@ -13,7 +13,7 @@ export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
-export DATA_PATH="/data/cache"
+export DATA_PATH=${model_cache:-"/data/cache"}
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
diff --git a/FaqGen/tests/test_compose_on_xeon.sh b/FaqGen/tests/test_compose_on_xeon.sh
index 9d494234a0..cc527b7e9d 100755
--- a/FaqGen/tests/test_compose_on_xeon.sh
+++ b/FaqGen/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/Translation/docker_compose/intel/cpu/xeon/compose.yaml b/Translation/docker_compose/intel/cpu/xeon/compose.yaml
index d876f99f2a..d1a6ee337d 100644
--- a/Translation/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/Translation/docker_compose/intel/cpu/xeon/compose.yaml
@@ -21,7 +21,7 @@ services:
timeout: 10s
retries: 100
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
diff --git a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
index be983b7b13..7e49db9c39 100644
--- a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -30,7 +30,7 @@ services:
- SYS_NICE
ipc: host
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
diff --git a/Translation/tests/test_compose_on_gaudi.sh b/Translation/tests/test_compose_on_gaudi.sh
index a4a201a762..63167b6e74 100644
--- a/Translation/tests/test_compose_on_gaudi.sh
+++ b/Translation/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/Translation/tests/test_compose_on_xeon.sh b/Translation/tests/test_compose_on_xeon.sh
index ed085b842a..9e2ac58cb7 100644
--- a/Translation/tests/test_compose_on_xeon.sh
+++ b/Translation/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml b/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 89525bd65d..4a81704be4 100644
--- a/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8399:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index fa17cf36d1..73e2747085 100644
--- a/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8399:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/VisualQnA/tests/test_compose_on_gaudi.sh b/VisualQnA/tests/test_compose_on_gaudi.sh
index 19dc07fcfb..3515be94e4 100644
--- a/VisualQnA/tests/test_compose_on_gaudi.sh
+++ b/VisualQnA/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/VisualQnA/tests/test_compose_on_xeon.sh b/VisualQnA/tests/test_compose_on_xeon.sh
index 9ab2e281f5..4e345b3f91 100644
--- a/VisualQnA/tests/test_compose_on_xeon.sh
+++ b/VisualQnA/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
From aef57f6664461fed510a4aaf5333a172b1d83d97 Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Tue, 4 Mar 2025 10:41:22 +0800
Subject: [PATCH 035/226] open chatqna frontend test (#1594)
Signed-off-by: chensuyue
Signed-off-by: Chingis Yundunov
---
ChatQnA/tests/test_compose_on_gaudi.sh | 2 +-
ChatQnA/tests/test_compose_on_rocm.sh | 4 ++--
ChatQnA/tests/test_compose_on_xeon.sh | 6 ++----
3 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh
index e1a37707e1..d9b40529e8 100644
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -189,7 +189,7 @@ function main() {
validate_microservices
validate_megaservice
- # validate_frontend
+ validate_frontend
stop_docker
echo y | docker system prune
diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh
index ebfd9562a4..732e2684aa 100644
--- a/ChatQnA/tests/test_compose_on_rocm.sh
+++ b/ChatQnA/tests/test_compose_on_rocm.sh
@@ -263,8 +263,8 @@ function main() {
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
- # validate_frontend
- # echo "==== frontend validated ===="
+ validate_frontend
+ echo "==== frontend validated ===="
stop_docker
echo y | docker system prune
diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh
index a61fbf11bb..e3cd8db1d1 100644
--- a/ChatQnA/tests/test_compose_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_on_xeon.sh
@@ -101,7 +101,7 @@ function validate_service() {
function validate_microservices() {
# Check if the microservices are running correctly.
- sleep 10m
+ sleep 3m
# tei for embedding service
validate_service \
@@ -111,8 +111,6 @@ function validate_microservices() {
"tei-embedding-server" \
'{"inputs":"What is Deep Learning?"}'
- sleep 1m # retrieval can't curl as expected, try to wait for more time
-
# retrieval microservice
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
validate_service \
@@ -196,7 +194,7 @@ function main() {
validate_microservices
validate_megaservice
- # validate_frontend
+ validate_frontend
stop_docker
echo y | docker system prune
From bd0996c936075d44cba88253ec2ceb03815534cd Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Tue, 4 Mar 2025 16:10:20 +0800
Subject: [PATCH 036/226] Enable CodeGen,CodeTrans and DocSum model cache for
docker compose test. (#1599)
1.Add cache path check
2.Enable CodeGen,CodeTrans and DocSum model cache for docker compose test.
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
.github/workflows/_run-docker-compose.yml | 9 ++++++++-
CodeGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
CodeGen/tests/test_compose_on_gaudi.sh | 1 +
CodeGen/tests/test_compose_on_xeon.sh | 1 +
CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
CodeTrans/tests/test_compose_on_gaudi.sh | 1 +
CodeTrans/tests/test_compose_on_xeon.sh | 1 +
DocSum/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
DocSum/tests/test_compose_on_gaudi.sh | 2 +-
DocSum/tests/test_compose_on_xeon.sh | 1 +
12 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/_run-docker-compose.yml b/.github/workflows/_run-docker-compose.yml
index 3d02b7b4ae..f21c3202f9 100644
--- a/.github/workflows/_run-docker-compose.yml
+++ b/.github/workflows/_run-docker-compose.yml
@@ -152,7 +152,14 @@ jobs:
run: |
cd ${{ github.workspace }}/$example/tests
if [[ "$IMAGE_REPO" == "" ]]; then export IMAGE_REPO="${OPEA_IMAGE_REPO}opea"; fi
- if [[ "$use_model_cache" == "true" ]]; then export model_cache="/data2/hf_model"; fi
+ if [[ "$use_model_cache" == "true" ]]; then
+ if [ -d "/data2/hf_model" ]; then
+ export model_cache="/data2/hf_model"
+ else
+ echo "Model cache directory /data2/hf_model does not exist"
+ export model_cache="~/.cache/huggingface/hub"
+ fi
+ fi
if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi
- name: Clean up container after test
diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
index 28940c9ba4..f9e7e26280 100644
--- a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
index 4d5ed95683..62ec96e626 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/CodeGen/tests/test_compose_on_gaudi.sh b/CodeGen/tests/test_compose_on_gaudi.sh
index 9ffbc41147..e6e6d1f033 100644
--- a/CodeGen/tests/test_compose_on_gaudi.sh
+++ b/CodeGen/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/CodeGen/tests/test_compose_on_xeon.sh b/CodeGen/tests/test_compose_on_xeon.sh
index f323e72070..70e5ba9c4f 100644
--- a/CodeGen/tests/test_compose_on_xeon.sh
+++ b/CodeGen/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
index b818956fa5..0ece6dff1d 100644
--- a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
index cbccde0605..3e25dee894 100644
--- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/CodeTrans/tests/test_compose_on_gaudi.sh b/CodeTrans/tests/test_compose_on_gaudi.sh
index 377937435f..e2aedcd6e9 100644
--- a/CodeTrans/tests/test_compose_on_gaudi.sh
+++ b/CodeTrans/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/CodeTrans/tests/test_compose_on_xeon.sh b/CodeTrans/tests/test_compose_on_xeon.sh
index 9060eb2833..efa09fe0a5 100644
--- a/CodeTrans/tests/test_compose_on_xeon.sh
+++ b/CodeTrans/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
index 2c4344cc23..0d87eaeb2b 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
@@ -21,7 +21,7 @@ services:
timeout: 10s
retries: 100
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh
index e129608aa1..66dd5b3180 100644
--- a/DocSum/tests/test_compose_on_gaudi.sh
+++ b/DocSum/tests/test_compose_on_gaudi.sh
@@ -28,7 +28,7 @@ export DOCSUM_PORT=9000
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export DocSum_COMPONENT_NAME="OpeaDocSumTgi"
export LOGFLAG=True
-export DATA_PATH="/data/cache"
+export DATA_PATH=${model_cache:-"/data/cache"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/DocSum/tests/test_compose_on_xeon.sh b/DocSum/tests/test_compose_on_xeon.sh
index de208292a5..7dc194ff68 100644
--- a/DocSum/tests/test_compose_on_xeon.sh
+++ b/DocSum/tests/test_compose_on_xeon.sh
@@ -12,6 +12,7 @@ export host_ip=$(hostname -I | awk '{print $1}')
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export MODEL_CACHE=${model_cache:-"./data"}
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MAX_INPUT_TOKENS=2048
From fe3132e13a82fb8d09a7c852bb7bf07117ea2f7d Mon Sep 17 00:00:00 2001
From: rbrugaro
Date: Tue, 4 Mar 2025 09:44:13 -0800
Subject: [PATCH 037/226] bugfix GraphRAG updated docker compose and env
settings to fix issues post refactor (#1567)
Signed-off-by: rbrugaro
Signed-off-by: Rita Brugarolas Brufau
Co-authored-by: chen, suyue
Co-authored-by: WenjiaoYue
Signed-off-by: Chingis Yundunov
---
GraphRAG/README.md | 5 +-
.../intel/hpu/gaudi/compose.yaml | 140 +++++++++++-------
.../docker_compose/intel/hpu/gaudi/set_env.sh | 23 ++-
GraphRAG/tests/test_compose_on_gaudi.sh | 72 +++++----
4 files changed, 144 insertions(+), 96 deletions(-)
diff --git a/GraphRAG/README.md b/GraphRAG/README.md
index d654357d44..3c9de58d69 100644
--- a/GraphRAG/README.md
+++ b/GraphRAG/README.md
@@ -72,7 +72,7 @@ Here is an example of `Nike 2023` pdf.
# download pdf file
wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
# upload pdf file with dataprep
-curl -X POST "http://${host_ip}:6004/v1/dataprep/ingest" \
+curl -X POST "http://${host_ip}:11103/v1/dataprep/ingest" \
-H "Content-Type: multipart/form-data" \
-F "files=@./nke-10k-2023.pdf"
```
@@ -80,8 +80,7 @@ curl -X POST "http://${host_ip}:6004/v1/dataprep/ingest" \
```bash
curl http://${host_ip}:8888/v1/graphrag \
-H "Content-Type: application/json" \
- -d '{
- "model": "gpt-4o-mini","messages": [{"role": "user","content": "What is the revenue of Nike in 2023?
+ -d '{"messages": [{"role": "user","content": "where do Nike subsidiaries operate?
"}]}'
```
diff --git a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
index 29171a20f2..76f1ab9f63 100644
--- a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -5,52 +5,65 @@ services:
neo4j-apoc:
image: neo4j:latest
container_name: neo4j-apoc
+ ports:
+ - "${NEO4J_PORT1:-7474}:7474"
+ - "${NEO4J_PORT2:-7687}:7687"
volumes:
- - /$HOME/neo4j/logs:/logs
- - /$HOME/neo4j/config:/config
- - /$HOME/neo4j/data:/data
- - /$HOME/neo4j/plugins:/plugins
+ - ./data/neo4j/logs:/logs
+ - ./data/neo4j/config:/config
+ - ./data/neo4j/data:/data
+ - ./data/neo4j/plugins:/plugins
ipc: host
environment:
+ - no_proxy=${no_proxy}
+ - http_proxy=${http_proxy}
+ - https_proxy=${https_proxy}
- NEO4J_AUTH=${NEO4J_USERNAME}/${NEO4J_PASSWORD}
- NEO4J_PLUGINS=["apoc"]
- NEO4J_apoc_export_file_enabled=true
- NEO4J_apoc_import_file_enabled=true
- NEO4J_apoc_import_file_use__neo4j__config=true
- NEO4J_dbms_security_procedures_unrestricted=apoc.\*
- ports:
- - "7474:7474"
- - "7687:7687"
+ - NEO4J_server_bolt_advertised__address=localhost:${NEO4J_PORT2}
restart: always
- tei-embedding-service:
+ healthcheck:
+ test: wget http://localhost:7474 || exit 1
+ interval: 5s
+ timeout: 10s
+ retries: 20
+ start_period: 3s
+ tei-embedding-serving:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
- container_name: tei-embedding-server
+ container_name: tei-embedding-serving
+ entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
ports:
- - "6006:80"
+ - "${TEI_EMBEDDER_PORT:-12000}:80"
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
- NO_PROXY: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- ipc: host
- command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
- tgi-gaudi-service:
- image: ghcr.io/huggingface/tgi-gaudi:2.0.6
+ host_ip: ${host_ip}
+ HF_TOKEN: ${HF_TOKEN}
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://${host_ip}:${TEI_EMBEDDER_PORT}/health"]
+ interval: 10s
+ timeout: 6s
+ retries: 48
+ tgi-gaudi-server:
+ image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- - "6005:80"
+ - ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "./data:/data"
+ - "${DATA_PATH:-./data}:/data"
environment:
no_proxy: ${no_proxy}
- NO_PROXY: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HF_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
@@ -60,33 +73,44 @@ services:
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
+ host_ip: ${host_ip}
+ LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+ MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS:-2048}
+ MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS:-4096}
TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN: false
runtime: habana
cap_add:
- SYS_NICE
ipc: host
- command: --model-id ${LLM_MODEL_ID} --max-input-length 6000 --max-total-tokens 8192
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model-id ${LLM_MODEL_ID}
+
dataprep-neo4j-llamaindex:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
- container_name: dataprep-neo4j-server
+ container_name: dataprep-neo4j-llamaindex
depends_on:
- - neo4j-apoc
- - tgi-gaudi-service
- - tei-embedding-service
+ neo4j-apoc:
+ condition: service_healthy
+ tgi-gaudi-server:
+ condition: service_healthy
+ tei-embedding-serving:
+ condition: service_healthy
ports:
- - "6004:5000"
+ - "${DATAPREP_PORT:-11103}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
host_ip: ${host_ip}
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${HF_TOKEN}
+ DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_NEO4J_LLAMAINDEX"
NEO4J_URL: ${NEO4J_URL}
NEO4J_USERNAME: ${NEO4J_USERNAME}
NEO4J_PASSWORD: ${NEO4J_PASSWORD}
- DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_NEO4J_LLAMAINDEX"
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -94,59 +118,61 @@ services:
OPENAI_LLM_MODEL: ${OPENAI_LLM_MODEL}
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
- MAX_OUTPUT_TOKENS: ${MAX_OUTPUT_TOKENS}
LOGFLAG: ${LOGFLAG}
+ HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
+ HF_TOKEN: ${HF_TOKEN}
+ MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS:-4096}
restart: unless-stopped
- retriever-neo4j-llamaindex:
+ retriever-neo4j:
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
- container_name: retriever-neo4j-server
- depends_on:
- - neo4j-apoc
- - tgi-gaudi-service
- - tei-embedding-service
+ container_name: retriever-neo4j
ports:
- - "7000:7000"
+ - "${RETRIEVER_PORT:-7000}:7000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- host_ip: ${host_ip}
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- HF_TOKEN: ${HF_TOKEN}
- NEO4J_URI: ${NEO4J_URL}
- NEO4J_USERNAME: ${NEO4J_USERNAME}
- NEO4J_PASSWORD: ${NEO4J_PASSWORD}
- TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+ HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
+ LOGFLAG: ${LOGFLAG:-False}
+ RETRIEVER_COMPONENT_NAME: ${RETRIEVER_COMPONENT_NAME:-OPEA_RETRIEVER_NEO4J}
TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
- OPENAI_API_KEY: ${OPENAI_API_KEY}
- OPENAI_EMBEDDING_MODEL: ${OPENAI_EMBEDDING_MODEL}
- OPENAI_LLM_MODEL: ${OPENAI_LLM_MODEL}
+ TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
- MAX_OUTPUT_TOKENS: ${MAX_OUTPUT_TOKENS}
- LOGFLAG: ${LOGFLAG}
- RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_NEO4J"
- restart: unless-stopped
+ NEO4J_URI: ${NEO4J_URI}
+ NEO4J_URL: ${NEO4J_URI}
+ NEO4J_USERNAME: ${NEO4J_USERNAME}
+ NEO4J_PASSWORD: ${NEO4J_PASSWORD}
+ VDMS_USE_CLIP: 0
+ host_ip: ${host_ip}
+ depends_on:
+ neo4j-apoc:
+ condition: service_healthy
+ tei-embedding-serving:
+ condition: service_healthy
+ tgi-gaudi-server:
+ condition: service_healthy
graphrag-gaudi-backend-server:
image: ${REGISTRY:-opea}/graphrag:${TAG:-latest}
container_name: graphrag-gaudi-backend-server
depends_on:
- neo4j-apoc
- - tei-embedding-service
- - retriever-neo4j-llamaindex
- - tgi-gaudi-service
+ - tei-embedding-serving
+ - retriever-neo4j
+ - tgi-gaudi-server
ports:
- "8888:8888"
+ - "${MEGA_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=graphrag-gaudi-backend-server
- - RETRIEVER_SERVICE_HOST_IP=retriever-neo4j-llamaindex
+ - RETRIEVER_SERVICE_HOST_IP=retriever-neo4j
- RETRIEVER_SERVICE_PORT=7000
- - LLM_SERVER_HOST_IP=tgi-gaudi-service
- - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
+ - LLM_SERVER_HOST_IP=tgi-gaudi-server
+ - LLM_SERVER_PORT=80
- LLM_MODEL_ID=${LLM_MODEL_ID}
- LOGFLAG=${LOGFLAG}
ipc: host
diff --git a/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh b/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh
index 97c462c581..a4fd8049b0 100644
--- a/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -10,16 +10,25 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
+export TEI_EMBEDDER_PORT=11633
+export LLM_ENDPOINT_PORT=11634
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export OPENAI_EMBEDDING_MODEL="text-embedding-3-small"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export OPENAI_LLM_MODEL="gpt-4o"
-export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
-export TGI_LLM_ENDPOINT="http://${host_ip}:6005"
-export NEO4J_URL="bolt://${host_ip}:7687"
-export NEO4J_USERNAME=neo4j
+export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
+export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export NEO4J_PORT1=11631
+export NEO4J_PORT2=11632
+export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
+export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
+export NEO4J_USERNAME="neo4j"
+export NEO4J_PASSWORD="neo4jtest"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
export LOGFLAG=True
-export RETRIEVER_SERVICE_PORT=80
-export LLM_SERVER_PORT=80
-export MAX_OUTPUT_TOKENS=1024
+export MAX_INPUT_TOKENS=4096
+export MAX_TOTAL_TOKENS=8192
+export DATA_PATH="/mnt/nvme2n1/hf_cache"
+export DATAPREP_PORT=11103
+export RETRIEVER_PORT=11635
diff --git a/GraphRAG/tests/test_compose_on_gaudi.sh b/GraphRAG/tests/test_compose_on_gaudi.sh
index 17f03ce61e..bec978ad51 100755
--- a/GraphRAG/tests/test_compose_on_gaudi.sh
+++ b/GraphRAG/tests/test_compose_on_gaudi.sh
@@ -12,7 +12,7 @@ export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
+export host_ip=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
@@ -33,25 +33,38 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
+ docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
- export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
- export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+
+ export TEI_EMBEDDER_PORT=11633
+ export LLM_ENDPOINT_PORT=11634
+ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+ export OPENAI_EMBEDDING_MODEL="text-embedding-3-small"
+ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ export OPENAI_LLM_MODEL="gpt-4o"
+ export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
+ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+ export NEO4J_PORT1=11631
+ export NEO4J_PORT2=11632
+ export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
+ export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_USERNAME="neo4j"
export NEO4J_PASSWORD="neo4jtest"
- export NEO4J_URL="bolt://${ip_address}:7687"
- export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006"
- export TGI_LLM_ENDPOINT="http://${ip_address}:6005"
- export host_ip=${ip_address}
- export LOGFLAG=true
- export MAX_OUTPUT_TOKENS="1024"
+ export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
+ export LOGFLAG=True
+ export MAX_INPUT_TOKENS=4096
+ export MAX_TOTAL_TOKENS=8192
+ export DATAPREP_PORT=11103
+ export RETRIEVER_PORT=11635
+ export MEGA_SERVICE_PORT=8888
unset OPENAI_API_KEY
# Start Docker Containers
@@ -116,7 +129,7 @@ function validate_microservices() {
# validate neo4j-apoc
validate_service \
- "${ip_address}:7474" \
+ "${host_ip}:${NEO4J_PORT1}" \
"200 OK" \
"neo4j-apoc" \
"neo4j-apoc" \
@@ -124,45 +137,46 @@ function validate_microservices() {
# tei for embedding service
validate_service \
- "${ip_address}:6006/embed" \
+ "${host_ip}:${TEI_EMBEDDER_PORT}/embed" \
"[[" \
"tei-embedding-service" \
- "tei-embedding-server" \
+ "tei-embedding-serving" \
'{"inputs":"What is Deep Learning?"}'
sleep 1m # retrieval can't curl as expected, try to wait for more time
+ # tgi for llm service
+ validate_service \
+ "${host_ip}:${LLM_ENDPOINT_PORT}/generate" \
+ "generated_text" \
+ "tgi-gaudi-service" \
+ "tgi-gaudi-server" \
+ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
# test /v1/dataprep/ingest graph extraction
echo "Like many companies in the O&G sector, the stock of Chevron (NYSE:CVX) has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply (~25%) during that same time frame. Over the years, Chevron has kept a very strong balance sheet. FirstEnergy (NYSE:FE – Get Rating) posted its earnings results on Tuesday. The utilities provider reported $0.53 earnings per share for the quarter, topping the consensus estimate of $0.52 by $0.01, RTT News reports. FirstEnergy had a net margin of 10.85% and a return on equity of 17.17%. The Dáil was almost suspended on Thursday afternoon after Sinn Féin TD John Brady walked across the chamber and placed an on-call pager in front of the Minister for Housing Darragh O’Brien during a debate on retained firefighters. Mr O’Brien said Mr Brady had taken part in an act of theatre that was obviously choreographed.Around 2,000 retained firefighters around the country staged a second day of industrial action on Tuesday and are due to start all out-strike action from next Tuesday. The mostly part-time workers, who keep the services going outside of Ireland’s larger urban centres, are taking industrial action in a dispute over pay and working conditions. Speaking in the Dáil, Sinn Féin deputy leader Pearse Doherty said firefighters had marched on Leinster House today and were very angry at the fact the Government will not intervene. Reintroduction of tax relief on mortgages needs to be considered, O’Brien says. Martin withdraws comment after saying People Before Profit would ‘put the jackboot on people’ Taoiseach ‘propagated fears’ farmers forced to rewet land due to nature restoration law – Cairns An intervention is required now. 
I’m asking you to make an improved offer in relation to pay for retained firefighters, Mr Doherty told the housing minister.I’m also asking you, and challenging you, to go outside after this Order of Business and meet with the firefighters because they are just fed up to the hilt in relation to what you said.Some of them have handed in their pagers to members of the Opposition and have challenged you to wear the pager for the next number of weeks, put up with an €8,600 retainer and not leave your community for the two and a half kilometres and see how you can stand over those type of pay and conditions. At this point, Mr Brady got up from his seat, walked across the chamber and placed the pager on the desk in front of Mr O’Brien. Ceann Comhairle Seán Ó Fearghaíl said the Sinn Féin TD was completely out of order and told him not to carry out a charade in this House, adding it was absolutely outrageous behaviour and not to be encouraged.Mr O’Brien said Mr Brady had engaged in an act of theatre here today which was obviously choreographed and was then interrupted with shouts from the Opposition benches. Mr Ó Fearghaíl said he would suspend the House if this racket continues.Mr O’Brien later said he said he was confident the dispute could be resolved and he had immense regard for firefighters. The minister said he would encourage the unions to re-engage with the State’s industrial relations process while also accusing Sinn Féin of using the issue for their own political gain." > $LOG_PATH/dataprep_file.txt
validate_service \
- "http://${ip_address}:6004/v1/dataprep/ingest" \
+ "http://${host_ip}:${DATAPREP_PORT}/v1/dataprep/ingest" \
"Data preparation succeeded" \
"extract_graph_neo4j" \
- "dataprep-neo4j-server"
+ "dataprep-neo4j-llamaindex"
sleep 2m
# retrieval microservice
validate_service \
- "${ip_address}:7000/v1/retrieval" \
- "retrieved_docs" \
+ "${host_ip}:${RETRIEVER_PORT}/v1/retrieval" \
+ "documents" \
"retriever_community_answers_neo4j" \
- "retriever-neo4j-server" \
- "{\"model\": \"gpt-4o-mini\",\"messages\": [{\"role\": \"user\",\"content\": \"Who is John Brady and has he had any confrontations?\"}]}"
+ "retriever-neo4j" \
+ "{\"messages\": [{\"role\": \"user\",\"content\": \"Who is John Brady and has he had any confrontations?\"}]}"
- # tgi for llm service
- validate_service \
- "${ip_address}:6005/generate" \
- "generated_text" \
- "tgi-gaudi-service" \
- "tgi-gaudi-server" \
- '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
-}
+ }
function validate_megaservice() {
# Curl the Mega Service
validate_service \
- "${ip_address}:8888/v1/graphrag" \
+ "${host_ip}:${MEGA_SERVICE_PORT}/v1/graphrag" \
"data: " \
"graphrag-megaservice" \
"graphrag-gaudi-backend-server" \
@@ -181,7 +195,7 @@ function validate_frontend() {
fi
source activate ${conda_env_name}
- sed -i "s/localhost/$ip_address/g" playwright.config.ts
+ sed -i "s/localhost/$host_ip/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps
From a9154e8e29ca298a53c2ff04b7e87f52bb7584fb Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Wed, 5 Mar 2025 11:30:04 +0800
Subject: [PATCH 038/226] Enable ChatQnA model cache for docker compose test.
(#1605)
Enable ChatQnA model cache for docker compose test.
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml | 6 +++---
ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml | 6 +++---
.../docker_compose/intel/cpu/xeon/compose_pinecone.yaml | 6 +++---
ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml | 6 +++---
ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 6 +++---
.../intel/cpu/xeon/compose_without_rerank.yaml | 4 ++--
ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 6 +++---
.../intel/hpu/gaudi/compose_guardrails.yaml | 8 ++++----
ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml | 6 +++---
.../intel/hpu/gaudi/compose_without_rerank.yaml | 4 ++--
ChatQnA/tests/test_compose_guardrails_on_gaudi.sh | 1 +
ChatQnA/tests/test_compose_on_gaudi.sh | 1 +
ChatQnA/tests/test_compose_on_rocm.sh | 1 +
ChatQnA/tests/test_compose_on_xeon.sh | 1 +
ChatQnA/tests/test_compose_pinecone_on_xeon.sh | 1 +
ChatQnA/tests/test_compose_qdrant_on_xeon.sh | 1 +
ChatQnA/tests/test_compose_tgi_on_gaudi.sh | 1 +
ChatQnA/tests/test_compose_tgi_on_xeon.sh | 1 +
ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh | 1 +
ChatQnA/tests/test_compose_without_rerank_on_xeon.sh | 1 +
20 files changed, 39 insertions(+), 29 deletions(-)
diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
index 11c0b78cae..193f4346e7 100644
--- a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -30,7 +30,7 @@ services:
ports:
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
volumes:
- - "/var/opea/chatqna-service/data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
ipc: host
environment:
@@ -72,7 +72,7 @@ services:
ports:
- "${CHATQNA_TEI_RERANKING_PORT}:80"
volumes:
- - "/var/opea/chatqna-service/data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -104,7 +104,7 @@ services:
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
volumes:
- - "/var/opea/chatqna-service/data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 3c3e6f49a7..00c6a2aec2 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
index de784dfabd..a2d2318945 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
@@ -28,7 +28,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -59,7 +59,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -75,7 +75,7 @@ services:
ports:
- "9009:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
index 46123d3e90..8a7fabdfad 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
@@ -32,7 +32,7 @@ services:
ports:
- "6040:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "6041:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "6042:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
index 5831181370..4a7c4f4627 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
index 917c6ee078..72fbdead0a 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "9009:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index b75312824e..855613fbc2 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -62,7 +62,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -83,7 +83,7 @@ services:
ports:
- "8007:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
index d5b56e424c..bd1b3cc0ff 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8088:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -70,7 +70,7 @@ services:
ports:
- "8090:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -124,7 +124,7 @@ services:
ports:
- "8008:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
index 7fb743e814..fd27be4dfd 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -85,7 +85,7 @@ services:
ports:
- "8005:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
index 2d872c4d30..6f8c7fe0dd 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8007:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
index 9a70a4aff0..d667a89f3c 100644
--- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh
index d9b40529e8..8858900148 100644
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh
index 732e2684aa..f9623f1691 100644
--- a/ChatQnA/tests/test_compose_on_rocm.sh
+++ b/ChatQnA/tests/test_compose_on_rocm.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"/var/opea/chatqna-service/data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh
index e3cd8db1d1..bdab2637bf 100644
--- a/ChatQnA/tests/test_compose_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh
index b45d53871c..17f32ed6cc 100755
--- a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh
index a8539d617c..8c84a9a9ff 100644
--- a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
index 6b93618932..25bfe8cdee 100644
--- a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_tgi_on_xeon.sh b/ChatQnA/tests/test_compose_tgi_on_xeon.sh
index 0a9687a5ff..f00d8c6436 100644
--- a/ChatQnA/tests/test_compose_tgi_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_tgi_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
index 5975358f29..9e9d7df735 100644
--- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
index 50e3feb243..279bc780d0 100644
--- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
From 10fb9288100b3e5c59949f5bef05b3353d1fac30 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Wed, 5 Mar 2025 17:13:24 +0800
Subject: [PATCH 039/226] Enable SearchQnA model cache for docker compose test.
(#1606)
Enable SearchQnA model cache for docker compose test.
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml | 6 +++---
SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml | 6 +++---
SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 6 +++---
SearchQnA/tests/test_compose_on_gaudi.sh | 1 +
SearchQnA/tests/test_compose_on_rocm.sh | 1 +
SearchQnA/tests/test_compose_on_xeon.sh | 1 +
6 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml b/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml
index f531281271..f8318de8fd 100644
--- a/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -10,7 +10,7 @@ services:
ports:
- "3001:80"
volumes:
- - "./data:/data"
+ - "${MODEL_PATH}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -56,7 +56,7 @@ services:
ports:
- "3004:80"
volumes:
- - "./data:/data"
+ - "${MODEL_PATH}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -86,7 +86,7 @@ services:
ports:
- "3006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_PATH}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 61f5f2a2fc..7ce41a4205 100644
--- a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -9,7 +9,7 @@ services:
ports:
- "3001:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -60,7 +60,7 @@ services:
ports:
- "3004:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -96,7 +96,7 @@ services:
ports:
- "3006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index f79bb9758c..7ad5990b3d 100644
--- a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -9,7 +9,7 @@ services:
ports:
- "3001:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -67,7 +67,7 @@ services:
ports:
- "3004:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "3006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/SearchQnA/tests/test_compose_on_gaudi.sh b/SearchQnA/tests/test_compose_on_gaudi.sh
index bf357eebfb..e73d921b7a 100644
--- a/SearchQnA/tests/test_compose_on_gaudi.sh
+++ b/SearchQnA/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/SearchQnA/tests/test_compose_on_rocm.sh b/SearchQnA/tests/test_compose_on_rocm.sh
index cebe86133f..27de2b9bb0 100644
--- a/SearchQnA/tests/test_compose_on_rocm.sh
+++ b/SearchQnA/tests/test_compose_on_rocm.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_PATH=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/SearchQnA/tests/test_compose_on_xeon.sh b/SearchQnA/tests/test_compose_on_xeon.sh
index 121d4db9d4..aa8c3aa6e7 100644
--- a/SearchQnA/tests/test_compose_on_xeon.sh
+++ b/SearchQnA/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
From f746c78000050deea6541be8f12268dfba22ce93 Mon Sep 17 00:00:00 2001
From: Zhu Yongbo
Date: Wed, 5 Mar 2025 22:13:53 +0800
Subject: [PATCH 040/226] Fix docker image opea/edgecraftrag security issue
#1577 (#1617)
Signed-off-by: Zhu, Yongbo
Signed-off-by: Chingis Yundunov
---
EdgeCraftRAG/Dockerfile.server | 3 ++-
EdgeCraftRAG/edgecraftrag/requirements.txt | 4 ++--
EdgeCraftRAG/ui/docker/Dockerfile.ui | 2 +-
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/EdgeCraftRAG/Dockerfile.server b/EdgeCraftRAG/Dockerfile.server
index 3bb572f116..ab4060de80 100644
--- a/EdgeCraftRAG/Dockerfile.server
+++ b/EdgeCraftRAG/Dockerfile.server
@@ -37,7 +37,8 @@ RUN mkdir -p /home/user/gradio_cache
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
WORKDIR /home/user/edgecraftrag
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --upgrade pip setuptools==70.0.0 && \
+ pip install --no-cache-dir -r requirements.txt
WORKDIR /home/user/
diff --git a/EdgeCraftRAG/edgecraftrag/requirements.txt b/EdgeCraftRAG/edgecraftrag/requirements.txt
index 5ac0137fd1..becbfa767a 100644
--- a/EdgeCraftRAG/edgecraftrag/requirements.txt
+++ b/EdgeCraftRAG/edgecraftrag/requirements.txt
@@ -1,6 +1,6 @@
docx2txt
faiss-cpu>=1.8.0.post1
-langchain-core==0.2.29
+langchain-core>=0.2.29
llama-index>=0.11.0
llama-index-embeddings-openvino>=0.4.0
llama-index-llms-openai-like>=0.2.0
@@ -9,7 +9,7 @@ llama-index-postprocessor-openvino-rerank>=0.3.0
llama-index-readers-file>=0.4.0
llama-index-retrievers-bm25>=0.3.0
llama-index-vector-stores-faiss>=0.2.1
-opea-comps>=0.9
+opea-comps>=1.2
pillow>=10.4.0
python-docx==1.1.2
unstructured==0.16.11
diff --git a/EdgeCraftRAG/ui/docker/Dockerfile.ui b/EdgeCraftRAG/ui/docker/Dockerfile.ui
index 8abffc5557..8f8b9b0fb6 100644
--- a/EdgeCraftRAG/ui/docker/Dockerfile.ui
+++ b/EdgeCraftRAG/ui/docker/Dockerfile.ui
@@ -15,7 +15,7 @@ RUN mkdir -p /home/user/gradio_cache
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
WORKDIR /home/user/ui
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
+RUN pip install --no-cache-dir --upgrade pip setuptools==70.0.0 && \
pip install --no-cache-dir -r requirements.txt
USER user
From a7c83e31559c6171b2061da8786302168d04402d Mon Sep 17 00:00:00 2001
From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com>
Date: Wed, 5 Mar 2025 22:15:07 +0800
Subject: [PATCH 041/226] [AudioQnA] Fix the LLM model field for inputs
alignment (#1611)
Signed-off-by: Wang, Kai Lawrence
Signed-off-by: Chingis Yundunov
---
AudioQnA/audioqna.py | 3 ++-
AudioQnA/audioqna_multilang.py | 3 ++-
AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml | 1 +
AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml | 1 +
AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 1 +
5 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/AudioQnA/audioqna.py b/AudioQnA/audioqna.py
index 79abcccb96..f74e58053f 100644
--- a/AudioQnA/audioqna.py
+++ b/AudioQnA/audioqna.py
@@ -16,13 +16,14 @@
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
- next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
+ next_inputs["model"] = LLM_MODEL_ID
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
diff --git a/AudioQnA/audioqna_multilang.py b/AudioQnA/audioqna_multilang.py
index 66c2ad1a37..edc14cc93c 100644
--- a/AudioQnA/audioqna_multilang.py
+++ b/AudioQnA/audioqna_multilang.py
@@ -17,6 +17,7 @@
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -24,7 +25,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
- next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
+ next_inputs["model"] = LLM_MODEL_ID
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
diff --git a/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml b/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
index 4cef1598c2..646b079a29 100644
--- a/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -69,6 +69,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
+ - LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 48756c00b6..cf9579960b 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -61,6 +61,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
+ - LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 45691f478b..bcbbac0070 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -82,6 +82,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
+ - LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
From db31e553a6f114fd67a0eb6a2405682986fb6b6d Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Fri, 7 Mar 2025 08:38:59 +0800
Subject: [PATCH 042/226] Update compose.yaml for SearchQnA (#1622)
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml | 6 +++---
SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml | 6 +++---
SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 6 +++---
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml b/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml
index f8318de8fd..fef008250d 100644
--- a/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/SearchQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -10,7 +10,7 @@ services:
ports:
- "3001:80"
volumes:
- - "${MODEL_PATH}:/data"
+ - "${MODEL_PATH:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -56,7 +56,7 @@ services:
ports:
- "3004:80"
volumes:
- - "${MODEL_PATH}:/data"
+ - "${MODEL_PATH:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -86,7 +86,7 @@ services:
ports:
- "3006:80"
volumes:
- - "${MODEL_PATH}:/data"
+ - "${MODEL_PATH:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 7ce41a4205..29b5229b83 100644
--- a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -9,7 +9,7 @@ services:
ports:
- "3001:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -60,7 +60,7 @@ services:
ports:
- "3004:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -96,7 +96,7 @@ services:
ports:
- "3006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 7ad5990b3d..d1df099437 100644
--- a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -9,7 +9,7 @@ services:
ports:
- "3001:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -67,7 +67,7 @@ services:
ports:
- "3004:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "3006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
From 3b33f30ec8ddb7670be5852da228e79282c23477 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Fri, 7 Mar 2025 09:19:39 +0800
Subject: [PATCH 043/226] Update compose.yaml for ChatQnA (#1621)
Update compose.yaml for ChatQnA
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml | 6 +++---
ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml | 6 +++---
.../docker_compose/intel/cpu/xeon/compose_pinecone.yaml | 6 +++---
ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml | 6 +++---
ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 6 +++---
.../intel/cpu/xeon/compose_without_rerank.yaml | 4 ++--
ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 6 +++---
.../intel/hpu/gaudi/compose_guardrails.yaml | 8 ++++----
ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml | 6 +++---
.../intel/hpu/gaudi/compose_without_rerank.yaml | 4 ++--
10 files changed, 29 insertions(+), 29 deletions(-)
diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
index 193f4346e7..da1f4ddda4 100644
--- a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -30,7 +30,7 @@ services:
ports:
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
ipc: host
environment:
@@ -72,7 +72,7 @@ services:
ports:
- "${CHATQNA_TEI_RERANKING_PORT}:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -104,7 +104,7 @@ services:
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 00c6a2aec2..1ec229115e 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
index a2d2318945..a398e9d983 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
@@ -28,7 +28,7 @@ services:
ports:
- "6006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -59,7 +59,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -75,7 +75,7 @@ services:
ports:
- "9009:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
index 8a7fabdfad..0504ff07a1 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
@@ -32,7 +32,7 @@ services:
ports:
- "6040:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "6041:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "6042:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
index 4a7c4f4627..34d95ffc68 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
index 72fbdead0a..70ea084408 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "9009:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 855613fbc2..8ff06ecc35 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -62,7 +62,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -83,7 +83,7 @@ services:
ports:
- "8007:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
index bd1b3cc0ff..b3388e0b5f 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8088:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -70,7 +70,7 @@ services:
ports:
- "8090:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -124,7 +124,7 @@ services:
ports:
- "8008:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
index fd27be4dfd..a14e3fca67 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -85,7 +85,7 @@ services:
ports:
- "8005:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
index 6f8c7fe0dd..167ce9c1e1 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8007:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
From e6d0c279d0619af1894c350c9abceca17153f678 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Fri, 7 Mar 2025 09:20:08 +0800
Subject: [PATCH 044/226] Update compose.yaml (#1620)
Update compose.yaml for AudioQnA, DBQnA, DocIndexRetriever, FaqGen, Translation and VisualQnA.
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml | 2 +-
AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
DBQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml | 4 ++--
DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml | 4 ++--
FaqGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
Translation/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
Translation/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
11 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
index cf9579960b..3b47780d80 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -30,7 +30,7 @@ services:
ports:
- "3006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
index c6ad650943..fde5a56902 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
@@ -31,7 +31,7 @@ services:
ports:
- "3006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index bcbbac0070..9e43a355b5 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -40,7 +40,7 @@ services:
ports:
- "3006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 8e4c15bd6b..b96a71d01d 100644
--- a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
index 6ecebfdc23..119b460d92 100644
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
@@ -38,7 +38,7 @@ services:
ports:
- "6006:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -96,7 +96,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
index f47d01a7cf..a2bfd878fc 100644
--- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -34,7 +34,7 @@ services:
ports:
- "8090:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -95,7 +95,7 @@ services:
ports:
- "8808:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
index ea24486cda..ca86a18f2d 100644
--- a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/Translation/docker_compose/intel/cpu/xeon/compose.yaml b/Translation/docker_compose/intel/cpu/xeon/compose.yaml
index d1a6ee337d..4b77d84484 100644
--- a/Translation/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/Translation/docker_compose/intel/cpu/xeon/compose.yaml
@@ -21,7 +21,7 @@ services:
timeout: 10s
retries: 100
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
diff --git a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
index 7e49db9c39..9516e60ce6 100644
--- a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -30,7 +30,7 @@ services:
- SYS_NICE
ipc: host
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml b/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 4a81704be4..b595bdcba7 100644
--- a/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8399:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 73e2747085..bd4004e399 100644
--- a/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8399:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
From caefabe22045f4ef34f897c93a2bf612bf790f70 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Fri, 7 Mar 2025 09:20:28 +0800
Subject: [PATCH 045/226] Update compose.yaml (#1619)
Update compose.yaml for CodeGen, CodeTrans and DocSum
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
CodeGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
DocSum/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
index f9e7e26280..7973951000 100644
--- a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
index 62ec96e626..19a77bef54 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
index 0ece6dff1d..2028760c48 100644
--- a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
index 3e25dee894..e697a0927b 100644
--- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
index 0d87eaeb2b..8d91db5e73 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
@@ -21,7 +21,7 @@ services:
timeout: 10s
retries: 100
volumes:
- - "${MODEL_CACHE}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
From a74dc1e988bf1723c9fe351eee2fc42c7ac84269 Mon Sep 17 00:00:00 2001
From: Letong Han <106566639+letonghan@users.noreply.github.com>
Date: Fri, 7 Mar 2025 10:56:21 +0800
Subject: [PATCH 046/226] Enable vllm for CodeTrans (#1626)
Set vllm as default llm serving, and add related docker compose files, readmes, and test scripts.
Issue: https://github.com/opea-project/GenAIExamples/issues/1436
Signed-off-by: letonghan
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
.../docker_compose/intel/cpu/xeon/README.md | 71 ++++++-
.../intel/cpu/xeon/compose.yaml | 26 +--
.../intel/cpu/xeon/compose_tgi.yaml | 95 +++++++++
.../docker_compose/intel/hpu/gaudi/README.md | 68 +++++-
.../intel/hpu/gaudi/compose.yaml | 40 ++--
.../intel/hpu/gaudi/compose_tgi.yaml | 99 +++++++++
CodeTrans/docker_compose/set_env.sh | 7 +-
CodeTrans/docker_image_build/build.yaml | 12 ++
CodeTrans/tests/test_compose_on_gaudi.sh | 33 ++-
CodeTrans/tests/test_compose_on_xeon.sh | 35 ++--
CodeTrans/tests/test_compose_tgi_on_gaudi.sh | 194 ++++++++++++++++++
CodeTrans/tests/test_compose_tgi_on_xeon.sh | 194 ++++++++++++++++++
12 files changed, 801 insertions(+), 73 deletions(-)
create mode 100644 CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml
create mode 100644 CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
create mode 100644 CodeTrans/tests/test_compose_tgi_on_gaudi.sh
create mode 100644 CodeTrans/tests/test_compose_tgi_on_xeon.sh
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/README.md b/CodeTrans/docker_compose/intel/cpu/xeon/README.md
index b5aebe8690..a7a8066202 100755
--- a/CodeTrans/docker_compose/intel/cpu/xeon/README.md
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/README.md
@@ -2,6 +2,8 @@
This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using microservices `llm`. We will publish the Docker images to Docker Hub soon, it will simplify the deployment process for this service.
+The default pipeline deploys with vLLM as the LLM serving component. It also provides options of using TGI backend for LLM microservice, please refer to [start-microservice-docker-containers](#start-microservice-docker-containers) section in this page.
+
## 🚀 Create an AWS Xeon Instance
To run the example on a AWS Xeon instance, start by creating an AWS account if you don't have one already. Then, get started with the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home). AWS EC2 M7i, C7i, C7i-flex and M7i-flex are Intel Xeon Scalable processor instances suitable for the task. (code named Sapphire Rapids).
@@ -63,6 +65,37 @@ By default, the LLM model is set to a default value as listed below:
Change the `LLM_MODEL_ID` below for your needs.
+For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below:
+
+1. Online
+
+ ```bash
+ export HF_TOKEN=${your_hf_token}
+ export HF_ENDPOINT="https://hf-mirror.com"
+ model_name="mistralai/Mistral-7B-Instruct-v0.3"
+ # Start vLLM LLM Service
+ docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+ # Start TGI LLM Service
+ docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
+ ```
+
+2. Offline
+
+ - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `mistralai/Mistral-7B-Instruct-v0.3`.
+
+ - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`.
+
+ - Run the following command to start the LLM service.
+
+ ```bash
+ export HF_TOKEN=${your_hf_token}
+ export model_path="/path/to/model"
+ # Start vLLM LLM Service
+ docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
+ # Start TGI LLM Service
+ docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
+ ```
+
### Setup Environment Variables
1. Set the required environment variables:
@@ -95,15 +128,47 @@ Change the `LLM_MODEL_ID` below for your needs.
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/cpu/xeon
-docker compose up -d
+```
+
+If using vLLM as the LLM serving backend.
+
+```bash
+docker compose -f compose.yaml up -d
+```
+
+If using TGI as the LLM serving backend.
+
+```bash
+docker compose -f compose_tgi.yaml up -d
```
### Validate Microservices
-1. TGI Service
+1. LLM backend Service
+
+ In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready.
+
+ Try the command below to check whether the LLM serving is ready.
+
+ ```bash
+ # vLLM service
+ docker logs codetrans-xeon-vllm-service 2>&1 | grep complete
+ # If the service is ready, you will get the response like below.
+ INFO: Application startup complete.
+ ```
+
+ ```bash
+ # TGI service
+ docker logs codetrans-xeon-tgi-service | grep Connected
+ # If the service is ready, you will get the response like below.
+ 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
+ ```
+
+ Then try the `cURL` command below to validate services.
```bash
- curl http://${host_ip}:8008/generate \
+ # either vLLM or TGI service
+ curl http://${host_ip}:8008/v1/chat/completions \
-X POST \
-d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
index 2028760c48..24c8bfdd39 100644
--- a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
services:
- tgi-service:
- image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
- container_name: codetrans-tgi-service
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ container_name: codetrans-xeon-vllm-service
ports:
- "8008:80"
volumes:
@@ -15,18 +15,19 @@ services:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- host_ip: ${host_ip}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
- command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
- container_name: llm-textgen-server
+ container_name: codetrans-xeon-llm-server
depends_on:
- tgi-service:
+ vllm-service:
condition: service_healthy
ports:
- "9000:9000"
@@ -35,18 +36,19 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
- HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-xeon-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-xeon-backend-server
depends_on:
- - tgi-service
+ - vllm-service
- llm
ports:
- - "7777:7777"
+ - "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -61,7 +63,7 @@ services:
depends_on:
- codetrans-xeon-backend-server
ports:
- - "5173:5173"
+ - "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml
new file mode 100644
index 0000000000..77c668241c
--- /dev/null
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -0,0 +1,95 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ tgi-service:
+ image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+ container_name: codetrans-xeon-tgi-service
+ ports:
+ - "8008:80"
+ volumes:
+ - "${MODEL_CACHE}:/data"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ host_ip: ${host_ip}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ llm:
+ image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+ container_name: codetrans-xeon-llm-server
+ depends_on:
+ tgi-service:
+ condition: service_healthy
+ ports:
+ - "9000:9000"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ restart: unless-stopped
+ codetrans-xeon-backend-server:
+ image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
+ container_name: codetrans-xeon-backend-server
+ depends_on:
+ - tgi-service
+ - llm
+ ports:
+ - "${BACKEND_SERVICE_PORT:-7777}:7777"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+ codetrans-xeon-ui-server:
+ image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
+ container_name: codetrans-xeon-ui-server
+ depends_on:
+ - codetrans-xeon-backend-server
+ ports:
+ - "${FRONTEND_SERVICE_PORT:-5173}:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+ codetrans-xeon-nginx-server:
+ image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
+ container_name: codetrans-xeon-nginx-server
+ depends_on:
+ - codetrans-xeon-backend-server
+ - codetrans-xeon-ui-server
+ ports:
+ - "${NGINX_PORT:-80}:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
+ - FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
+ - BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
+ - BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
+ - BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md
index 00241d6acf..cf5f2d3c11 100755
--- a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md
@@ -2,6 +2,8 @@
This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using microservices `llm`. We will publish the Docker images to Docker Hub soon, it will simplify the deployment process for this service.
+The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.
+
## 🚀 Build Docker Images
First of all, you need to build Docker Images locally and install the python package of it. This step can be ignored after the Docker images published to Docker hub.
@@ -55,6 +57,37 @@ By default, the LLM model is set to a default value as listed below:
Change the `LLM_MODEL_ID` below for your needs.
+For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below:
+
+1. Online
+
+ ```bash
+ export HF_TOKEN=${your_hf_token}
+ export HF_ENDPOINT="https://hf-mirror.com"
+ model_name="mistralai/Mistral-7B-Instruct-v0.3"
+ # Start vLLM LLM Service
+ docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+ # Start TGI LLM Service
+ docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
+ ```
+
+2. Offline
+
+ - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `mistralai/Mistral-7B-Instruct-v0.3`.
+
+ - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`.
+
+ - Run the following command to start the LLM service.
+
+ ```bash
+ export HF_TOKEN=${your_hf_token}
+ export model_path="/path/to/model"
+ # Start vLLM LLM Service
+ docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
+ # Start TGI LLM Service
+ docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
+ ```
+
### Setup Environment Variables
1. Set the required environment variables:
@@ -87,12 +120,43 @@ Change the `LLM_MODEL_ID` below for your needs.
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/hpu/gaudi
-docker compose up -d
+```
+
+If using vLLM as the LLM serving backend:
+
+```bash
+docker compose -f compose.yaml up -d
+```
+
+If using TGI as the LLM serving backend:
+
+```bash
+docker compose -f compose_tgi.yaml up -d
```
### Validate Microservices
-1. TGI Service
+1. LLM backend Service
+
+ In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready.
+
+ Try the command below to check whether the LLM serving is ready.
+
+ ```bash
+ # vLLM service
+ docker logs codetrans-gaudi-vllm-service 2>&1 | grep complete
+ # If the service is ready, you will get the response like below.
+ INFO: Application startup complete.
+ ```
+
+ ```bash
+ # TGI service
+ docker logs codetrans-gaudi-tgi-service | grep Connected
+ # If the service is ready, you will get the response like below.
+ 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
+ ```
+
+ Then try the `cURL` command below to validate services.
```bash
curl http://${host_ip}:8008/generate \
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
index e697a0927b..2caeaf0ec3 100644
--- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
services:
- tgi-service:
- image: ghcr.io/huggingface/tgi-gaudi:2.0.6
- container_name: codetrans-tgi-service
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+ container_name: codetrans-gaudi-vllm-service
ports:
- "8008:80"
volumes:
@@ -13,28 +13,27 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- ENABLE_HPU_GRAPH: true
- LIMIT_HPU_GRAPH: true
- USE_FLASH_ATTENTION: true
- FLASH_ATTENTION_RECOMPUTE: true
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ NUM_CARDS: ${NUM_CARDS}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
- test: ["CMD-SHELL", "sleep 500 && exit 0"]
- interval: 1s
- timeout: 505s
- retries: 1
+ test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
- command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
+    command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
- container_name: llm-textgen-gaudi-server
+ container_name: codetrans-xeon-llm-server
depends_on:
- tgi-service:
+ vllm-service:
condition: service_healthy
ports:
- "9000:9000"
@@ -43,18 +42,19 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
- HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-gaudi-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-gaudi-backend-server
depends_on:
- - tgi-service
+ - vllm-service
- llm
ports:
- - "7777:7777"
+ - "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -69,7 +69,7 @@ services:
depends_on:
- codetrans-gaudi-backend-server
ports:
- - "5173:5173"
+ - "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
new file mode 100644
index 0000000000..9bcc01f318
--- /dev/null
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -0,0 +1,99 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ tgi-service:
+ image: ghcr.io/huggingface/tgi-gaudi:2.0.6
+ container_name: codetrans-gaudi-tgi-service
+ ports:
+ - "8008:80"
+ volumes:
+ - "${MODEL_CACHE}:/data"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ ENABLE_HPU_GRAPH: true
+ LIMIT_HPU_GRAPH: true
+ USE_FLASH_ATTENTION: true
+ FLASH_ATTENTION_RECOMPUTE: true
+ runtime: habana
+ cap_add:
+ - SYS_NICE
+ ipc: host
+ command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096
+ llm:
+ image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+ container_name: codetrans-gaudi-llm-server
+ depends_on:
+ - tgi-service
+ ports:
+ - "9000:9000"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ restart: unless-stopped
+ codetrans-gaudi-backend-server:
+ image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
+ container_name: codetrans-gaudi-backend-server
+ depends_on:
+ - tgi-service
+ - llm
+ ports:
+ - "${BACKEND_SERVICE_PORT:-7777}:7777"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+ ipc: host
+ restart: always
+ codetrans-gaudi-ui-server:
+ image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
+ container_name: codetrans-gaudi-ui-server
+ depends_on:
+ - codetrans-gaudi-backend-server
+ ports:
+ - "${FRONTEND_SERVICE_PORT:-5173}:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+ codetrans-gaudi-nginx-server:
+ image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
+ container_name: codetrans-gaudi-nginx-server
+ depends_on:
+ - codetrans-gaudi-backend-server
+ - codetrans-gaudi-ui-server
+ ports:
+ - "${NGINX_PORT:-80}:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
+ - FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
+ - BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
+ - BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
+ - BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/CodeTrans/docker_compose/set_env.sh b/CodeTrans/docker_compose/set_env.sh
index b44c763a2e..d24bc1c20b 100644
--- a/CodeTrans/docker_compose/set_env.sh
+++ b/CodeTrans/docker_compose/set_env.sh
@@ -8,7 +8,12 @@ popd > /dev/null
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
+export LLM_ENDPOINT="http://${host_ip}:8008"
+export LLM_COMPONENT_NAME="OpeaTextGenService"
+export NUM_CARDS=1
+export BLOCK_SIZE=128
+export MAX_NUM_SEQS=256
+export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"
diff --git a/CodeTrans/docker_image_build/build.yaml b/CodeTrans/docker_image_build/build.yaml
index bfc0070619..e42102170f 100644
--- a/CodeTrans/docker_image_build/build.yaml
+++ b/CodeTrans/docker_image_build/build.yaml
@@ -23,6 +23,18 @@ services:
dockerfile: comps/llms/src/text-generation/Dockerfile
extends: codetrans
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+ vllm:
+ build:
+ context: vllm
+ dockerfile: Dockerfile.cpu
+ extends: codetrans
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ vllm-gaudi:
+ build:
+ context: vllm-fork
+ dockerfile: Dockerfile.hpu
+ extends: codetrans
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
nginx:
build:
context: GenAIComps
diff --git a/CodeTrans/tests/test_compose_on_gaudi.sh b/CodeTrans/tests/test_compose_on_gaudi.sh
index e2aedcd6e9..9c78ea5972 100644
--- a/CodeTrans/tests/test_compose_on_gaudi.sh
+++ b/CodeTrans/tests/test_compose_on_gaudi.sh
@@ -30,12 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="codetrans codetrans-ui llm-textgen nginx"
+ service_list="codetrans codetrans-ui llm-textgen vllm-gaudi nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
@@ -45,7 +45,12 @@ function start_services() {
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
- export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
+ export LLM_ENDPOINT="http://${ip_address}:8008"
+ export LLM_COMPONENT_NAME="OpeaTextGenService"
+ export NUM_CARDS=1
+ export BLOCK_SIZE=128
+ export MAX_NUM_SEQS=256
+ export MAX_SEQ_LEN_TO_CAPTURE=2048
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -65,13 +70,15 @@ function start_services() {
n=0
until [[ "$n" -ge 100 ]]; do
- docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
- if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+ docker logs codetrans-gaudi-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
+ if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
+
+ sleep 1m
}
function validate_services() {
@@ -103,27 +110,19 @@ function validate_services() {
}
function validate_microservices() {
- # tgi for embedding service
- validate_services \
- "${ip_address}:8008/generate" \
- "generated_text" \
- "tgi" \
- "codetrans-tgi-service" \
- '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
-
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
- "llm-textgen-gaudi-server" \
+ "codetrans-xeon-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
- "${ip_address}:7777/v1/codetrans" \
+ "${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-gaudi-backend-server" \
@@ -131,7 +130,7 @@ function validate_megaservice() {
# test the megeservice via nginx
validate_services \
- "${ip_address}:80/v1/codetrans" \
+ "${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-gaudi-nginx-server" \
@@ -170,7 +169,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
- docker compose stop && docker compose rm -f
+ docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {
diff --git a/CodeTrans/tests/test_compose_on_xeon.sh b/CodeTrans/tests/test_compose_on_xeon.sh
index efa09fe0a5..23660848d5 100644
--- a/CodeTrans/tests/test_compose_on_xeon.sh
+++ b/CodeTrans/tests/test_compose_on_xeon.sh
@@ -30,12 +30,16 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ git clone https://github.com/vllm-project/vllm.git && cd vllm
+ VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null
+ cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="codetrans codetrans-ui llm-textgen nginx"
+ service_list="codetrans codetrans-ui llm-textgen vllm nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
@@ -44,7 +48,8 @@ function start_services() {
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
- export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
+ export LLM_ENDPOINT="http://${ip_address}:8008"
+ export LLM_COMPONENT_NAME="OpeaTextGenService"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -60,17 +65,19 @@ function start_services() {
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
- docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
+ docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
- docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
- if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+ docker logs codetrans-xeon-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
+ if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
+
+ sleep 1m
}
function validate_services() {
@@ -102,20 +109,12 @@ function validate_services() {
}
function validate_microservices() {
- # tgi for embedding service
- validate_services \
- "${ip_address}:8008/generate" \
- "generated_text" \
- "tgi" \
- "codetrans-tgi-service" \
- '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
-
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
- "llm-textgen-server" \
+ "codetrans-xeon-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
@@ -123,7 +122,7 @@ function validate_microservices() {
function validate_megaservice() {
# Curl the Mega Service
validate_services \
- "${ip_address}:7777/v1/codetrans" \
+ "${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-xeon-backend-server" \
@@ -131,7 +130,7 @@ function validate_megaservice() {
# test the megeservice via nginx
validate_services \
- "${ip_address}:80/v1/codetrans" \
+ "${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-xeon-nginx-server" \
@@ -169,7 +168,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
- docker compose stop && docker compose rm -f
+ docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {
diff --git a/CodeTrans/tests/test_compose_tgi_on_gaudi.sh b/CodeTrans/tests/test_compose_tgi_on_gaudi.sh
new file mode 100644
index 0000000000..1c0404d397
--- /dev/null
+++ b/CodeTrans/tests/test_compose_tgi_on_gaudi.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="codetrans codetrans-ui llm-textgen nginx"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/hpu/gaudi/
+ export http_proxy=${http_proxy}
+ export https_proxy=${http_proxy}
+ export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
+ export LLM_ENDPOINT="http://${ip_address}:8008"
+ export LLM_COMPONENT_NAME="OpeaTextGenService"
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export MEGA_SERVICE_HOST_IP=${ip_address}
+ export LLM_SERVICE_HOST_IP=${ip_address}
+ export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
+ export FRONTEND_SERVICE_IP=${ip_address}
+ export FRONTEND_SERVICE_PORT=5173
+ export BACKEND_SERVICE_NAME=codetrans
+ export BACKEND_SERVICE_IP=${ip_address}
+ export BACKEND_SERVICE_PORT=7777
+ export NGINX_PORT=80
+ export host_ip=${ip_address}
+
+ sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+ # Start Docker Containers
+ docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+ n=0
+ until [[ "$n" -ge 100 ]]; do
+ docker logs codetrans-gaudi-tgi-service > ${LOG_PATH}/tgi_service_start.log
+ if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+ break
+ fi
+ sleep 5s
+ n=$((n+1))
+ done
+
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 5s
+}
+
+function validate_microservices() {
+    # tgi llm serving backend
+ validate_services \
+ "${ip_address}:8008/generate" \
+ "generated_text" \
+ "tgi" \
+ "codetrans-gaudi-tgi-service" \
+ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
+ # llm microservice
+ validate_services \
+ "${ip_address}:9000/v1/chat/completions" \
+ "data: " \
+ "llm" \
+ "codetrans-gaudi-llm-server" \
+ '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
+
+}
+
+function validate_megaservice() {
+ # Curl the Mega Service
+ validate_services \
+ "${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
+ "print" \
+ "mega-codetrans" \
+ "codetrans-gaudi-backend-server" \
+ '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
+
+    # test the megaservice via nginx
+ validate_services \
+ "${ip_address}:${NGINX_PORT}/v1/codetrans" \
+ "print" \
+ "mega-codetrans-nginx" \
+ "codetrans-gaudi-nginx-server" \
+ '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
+
+}
+
+function validate_frontend() {
+ cd $WORKPATH/ui/svelte
+ local conda_env_name="OPEA_e2e"
+ export PATH=${HOME}/miniforge3/bin/:$PATH
+ if conda info --envs | grep -q "$conda_env_name"; then
+ echo "$conda_env_name exist!"
+ else
+ conda create -n ${conda_env_name} python=3.12 -y
+ fi
+ source activate ${conda_env_name}
+
+ sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+ conda install -c conda-forge nodejs=22.6.0 -y
+ npm install && npm ci && npx playwright install --with-deps
+ node -v && npm -v && pip list
+
+ exit_status=0
+ npx playwright test || exit_status=$?
+
+ if [ $exit_status -ne 0 ]; then
+ echo "[TEST INFO]: ---------frontend test failed---------"
+ exit $exit_status
+ else
+ echo "[TEST INFO]: ---------frontend test passed---------"
+ fi
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/intel/hpu/gaudi/
+ docker compose -f compose_tgi.yaml stop && docker compose rm -f
+}
+
+function main() {
+
+ stop_docker
+
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ start_services
+
+ validate_microservices
+ validate_megaservice
+ validate_frontend
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
diff --git a/CodeTrans/tests/test_compose_tgi_on_xeon.sh b/CodeTrans/tests/test_compose_tgi_on_xeon.sh
new file mode 100644
index 0000000000..95154c7c9d
--- /dev/null
+++ b/CodeTrans/tests/test_compose_tgi_on_xeon.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="codetrans codetrans-ui llm-textgen nginx"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ export http_proxy=${http_proxy}
+ export https_proxy=${http_proxy}
+ export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
+ export LLM_ENDPOINT="http://${ip_address}:8008"
+ export LLM_COMPONENT_NAME="OpeaTextGenService"
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export MEGA_SERVICE_HOST_IP=${ip_address}
+ export LLM_SERVICE_HOST_IP=${ip_address}
+ export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
+ export FRONTEND_SERVICE_IP=${ip_address}
+ export FRONTEND_SERVICE_PORT=5173
+ export BACKEND_SERVICE_NAME=codetrans
+ export BACKEND_SERVICE_IP=${ip_address}
+ export BACKEND_SERVICE_PORT=7777
+ export NGINX_PORT=80
+ export host_ip=${ip_address}
+
+ sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+ # Start Docker Containers
+ docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+ n=0
+ until [[ "$n" -ge 100 ]]; do
+ docker logs codetrans-xeon-tgi-service > ${LOG_PATH}/tgi_service_start.log
+ if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+ break
+ fi
+ sleep 5s
+ n=$((n+1))
+ done
+
+ sleep 1m
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 5s
+}
+
+function validate_microservices() {
+    # tgi llm serving backend
+ validate_services \
+ "${ip_address}:8008/generate" \
+ "generated_text" \
+ "tgi" \
+ "codetrans-xeon-tgi-service" \
+ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
+ # llm microservice
+ validate_services \
+ "${ip_address}:9000/v1/chat/completions" \
+ "data: " \
+ "llm" \
+ "codetrans-xeon-llm-server" \
+ '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
+
+}
+
+function validate_megaservice() {
+ # Curl the Mega Service
+ validate_services \
+ "${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
+ "print" \
+ "mega-codetrans" \
+ "codetrans-xeon-backend-server" \
+ '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
+
+    # test the megaservice via nginx
+ validate_services \
+ "${ip_address}:${NGINX_PORT}/v1/codetrans" \
+ "print" \
+ "mega-codetrans-nginx" \
+ "codetrans-xeon-nginx-server" \
+ '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
+
+}
+
+function validate_frontend() {
+ cd $WORKPATH/ui/svelte
+ local conda_env_name="OPEA_e2e"
+ export PATH=${HOME}/miniforge3/bin/:$PATH
+ if conda info --envs | grep -q "$conda_env_name"; then
+ echo "$conda_env_name exist!"
+ else
+ conda create -n ${conda_env_name} python=3.12 -y
+ fi
+ source activate ${conda_env_name}
+
+ sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+ conda install -c conda-forge nodejs=22.6.0 -y
+ npm install && npm ci && npx playwright install --with-deps
+ node -v && npm -v && pip list
+
+ exit_status=0
+ npx playwright test || exit_status=$?
+
+ if [ $exit_status -ne 0 ]; then
+ echo "[TEST INFO]: ---------frontend test failed---------"
+ exit $exit_status
+ else
+ echo "[TEST INFO]: ---------frontend test passed---------"
+ fi
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ docker compose -f compose_tgi.yaml stop && docker compose rm -f
+}
+
+function main() {
+
+ stop_docker
+
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ start_services
+
+ validate_microservices
+ validate_megaservice
+ validate_frontend
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
From f6b63d1f26fde85e422a8c8d1b068ae1879d1d97 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Fri, 7 Mar 2025 11:00:48 +0800
Subject: [PATCH 047/226] Update model cache for AgentQnA (#1627)
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml | 2 +-
AgentQnA/tests/step2_start_retrieval_tool.sh | 2 +-
AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh | 2 +-
AgentQnA/tests/step4a_launch_and_validate_agent_tgi_on_rocm.sh | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
index c1864ff374..e264411aef 100644
--- a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "${AGENTQNA_TGI_SERVICE_PORT-8085}:80"
volumes:
- - /var/opea/agent-service/:/data
+ - ${HF_CACHE_DIR:-/var/opea/agent-service/}:/data
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/AgentQnA/tests/step2_start_retrieval_tool.sh b/AgentQnA/tests/step2_start_retrieval_tool.sh
index 91fb1ea0ae..c4b9a31fc7 100644
--- a/AgentQnA/tests/step2_start_retrieval_tool.sh
+++ b/AgentQnA/tests/step2_start_retrieval_tool.sh
@@ -9,7 +9,7 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export host_ip=${ip_address}
-export HF_CACHE_DIR=$WORKDIR/hf_cache
+export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
if [ ! -d "$HF_CACHE_DIR" ]; then
echo "Creating HF_CACHE directory"
mkdir -p "$HF_CACHE_DIR"
diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
index 56f017239b..798f38526a 100644
--- a/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
@@ -13,7 +13,7 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
-export HF_CACHE_DIR=/data2/huggingface
+export HF_CACHE_DIR=${model_cache:-"/data2/huggingface"}
if [ ! -d "$HF_CACHE_DIR" ]; then
HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"
diff --git a/AgentQnA/tests/step4a_launch_and_validate_agent_tgi_on_rocm.sh b/AgentQnA/tests/step4a_launch_and_validate_agent_tgi_on_rocm.sh
index 5b90aa41f2..0e3e8d1697 100644
--- a/AgentQnA/tests/step4a_launch_and_validate_agent_tgi_on_rocm.sh
+++ b/AgentQnA/tests/step4a_launch_and_validate_agent_tgi_on_rocm.sh
@@ -11,7 +11,7 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export HF_CACHE_DIR=$WORKDIR/hf_cache
+export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
if [ ! -d "$HF_CACHE_DIR" ]; then
mkdir -p "$HF_CACHE_DIR"
fi
From e836b36750839cd8a21cfcd5381a2815acab59d1 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Fri, 7 Mar 2025 07:13:29 +0200
Subject: [PATCH 048/226] Use GenAIComp base image to simplify Dockerfiles
(#1612)
Signed-off-by: Eero Tamminen
Signed-off-by: Chingis Yundunov
---
AudioQnA/Dockerfile | 44 ++---------------------------------
AudioQnA/Dockerfile.multilang | 44 ++---------------------------------
DocIndexRetriever/Dockerfile | 44 ++---------------------------------
EdgeCraftRAG/Dockerfile | 44 ++---------------------------------
FaqGen/Dockerfile | 44 ++---------------------------------
VideoQnA/Dockerfile | 44 ++---------------------------------
6 files changed, 12 insertions(+), 252 deletions(-)
diff --git a/AudioQnA/Dockerfile b/AudioQnA/Dockerfile
index 07245de371..1294c218ca 100644
--- a/AudioQnA/Dockerfile
+++ b/AudioQnA/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./audioqna.py $HOME/audioqna.py
diff --git a/AudioQnA/Dockerfile.multilang b/AudioQnA/Dockerfile.multilang
index 1d0573d217..997e4bed37 100644
--- a/AudioQnA/Dockerfile.multilang
+++ b/AudioQnA/Dockerfile.multilang
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./audioqna_multilang.py $HOME/audioqna_multilang.py
diff --git a/DocIndexRetriever/Dockerfile b/DocIndexRetriever/Dockerfile
index dcfd665f74..06fb1dc016 100644
--- a/DocIndexRetriever/Dockerfile
+++ b/DocIndexRetriever/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./retrieval_tool.py $HOME/retrieval_tool.py
diff --git a/EdgeCraftRAG/Dockerfile b/EdgeCraftRAG/Dockerfile
index fb7f5e14ec..fffb8d8970 100644
--- a/EdgeCraftRAG/Dockerfile
+++ b/EdgeCraftRAG/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./chatqna.py $HOME/chatqna.py
diff --git a/FaqGen/Dockerfile b/FaqGen/Dockerfile
index 2d1afd002a..d315bbb61b 100644
--- a/FaqGen/Dockerfile
+++ b/FaqGen/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./faqgen.py $HOME/faqgen.py
diff --git a/VideoQnA/Dockerfile b/VideoQnA/Dockerfile
index 0504a71881..2aade6088f 100644
--- a/VideoQnA/Dockerfile
+++ b/VideoQnA/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./videoqna.py $HOME/videoqna.py
From 48a6a0a3459f041ac359ecef7fa7b192fb61cb72 Mon Sep 17 00:00:00 2001
From: Shifani Rajabose
Date: Fri, 7 Mar 2025 01:31:34 -0500
Subject: [PATCH 049/226] [Bug: 112] Fix introduction in GenAIExamples main
README (#1631)
Signed-off-by: Chingis Yundunov
---
README.md | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 2db55575bd..369e504200 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,13 @@
## Introduction
-GenAIExamples are designed to give developers an easy entry into generative AI, featuring microservice-based samples that simplify the processes of deploying, testing, and scaling GenAI applications. All examples are fully compatible with Docker and Kubernetes, supporting a wide range of hardware platforms such as Gaudi, Xeon, and NVIDIA GPU, and other hardwares, ensuring flexibility and efficiency for your GenAI adoption.
+GenAIExamples are designed to give developers an easy entry into generative AI, featuring microservice-based samples that simplify the processes of deploying, testing, and scaling GenAI applications. All examples are fully compatible with both Docker and Kubernetes, supporting a wide range of hardware platforms such as Gaudi, Xeon, NVIDIA GPUs, and other hardwares including AMD GPUs, ensuring flexibility and efficiency for your GenAI adoption.
## Architecture
-[GenAIComps](https://github.com/opea-project/GenAIComps) is a service-based tool that includes microservice components such as llm, embedding, reranking, and so on. Using these components, various examples in GenAIExample can be constructed, including ChatQnA, DocSum, etc.
+[GenAIComps](https://github.com/opea-project/GenAIComps) is a service-based tool that includes microservice components such as llm, embedding, reranking, and so on. Using these components, various examples in GenAIExample can be constructed including ChatQnA, DocSum, etc.
-[GenAIInfra](https://github.com/opea-project/GenAIInfra), part of the OPEA containerization and cloud-native suite, enables quick and efficient deployment of GenAIExamples in the cloud.
+[GenAIInfra](https://github.com/opea-project/GenAIInfra) is part of the OPEA containerization and cloud-native suite and enables quick and efficient deployment of GenAIExamples in the cloud.
[GenAIEval](https://github.com/opea-project/GenAIEval) measures service performance metrics such as throughput, latency, and accuracy for GenAIExamples. This feature helps users compare performance across various hardware configurations easily.
@@ -18,18 +18,18 @@ The GenAIExamples [documentation](https://opea-project.github.io/latest/examples
## Getting Started
-GenAIExamples offers flexible deployment options that cater to different user needs, enabling efficient use and deployment in various environments. Here’s a brief overview of the three primary methods: Python startup, Docker Compose, and Kubernetes.
+GenAIExamples offers flexible deployment options that cater to different user needs, enabling efficient use and deployment in various environments. Three primary methods are presently used to do this: Python startup, Docker Compose, and Kubernetes.
Users can choose the most suitable approach based on ease of setup, scalability needs, and the environment in which they are operating.
### Deployment Guide
-Deployment is based on released docker images by default, check [docker image list](./docker_images_list.md) for detailed information. You can also build your own images following instructions.
+Deployment is based on released docker images by default - check [docker image list](./docker_images_list.md) for detailed information. You can also build your own images following instructions.
#### Prerequisite
-- For Docker Compose based deployment, you should have docker compose installed. Refer to [docker compose install](https://docs.docker.com/compose/install/).
-- For Kubernetes based deployment, you can use [Helm](https://helm.sh) or [GMC](https://github.com/opea-project/GenAIInfra/tree/main/microservices-connector/README.md) based deployment.
+- For Docker Compose-based deployment, you should have docker compose installed. Refer to [docker compose install](https://docs.docker.com/compose/install/) for more information.
+- For Kubernetes-based deployment, you can use [Helm](https://helm.sh) or [GMC](https://github.com/opea-project/GenAIInfra/tree/main/microservices-connector/README.md)-based deployment.
- You should have a kubernetes cluster ready for use. If not, you can refer to [k8s install](https://github.com/opea-project/docs/tree/main/guide/installation/k8s_install/README.md) to deploy one.
- (Optional) You should have Helm (version >= 3.15) installed if you want to deploy with Helm Charts. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
@@ -37,7 +37,7 @@ Deployment is based on released docker images by default, check [docker image li
- Recommended Hardware Reference
- Based on different deployment model size and performance requirement, you may choose different hardware platforms or cloud instances. Here are some reference platforms
+ Based on different deployment model sizes and performance requirements, you may choose different hardware platforms or cloud instances. Here are some of the reference platforms:
| Use Case | Deployment model | Reference Configuration | Hardware access/instances |
| -------- | ------------------------- | -------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
@@ -47,7 +47,7 @@ Deployment is based on released docker images by default, check [docker image li
#### Deploy Examples
-> **Note**: Check for [sample guides](https://opea-project.github.io/latest/examples/index.html) first for your use case. If it is not available, then refer to the table below.
+> **Note**: Check for [sample guides](https://opea-project.github.io/latest/examples/index.html) first for your use case. If it is not available, then refer to the table below:
| Use Case | Docker Compose
Deployment on Xeon | Docker Compose
Deployment on Gaudi | Docker Compose
Deployment on ROCm | Kubernetes with Helm Charts | Kubernetes with GMC |
| ----------------- | ------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------------- | ------------------------------------------------------------ |
From 48ee4c4b3a3fc57e76cb2c1c4bf73cb2120443fb Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Fri, 7 Mar 2025 15:05:08 +0800
Subject: [PATCH 050/226] Fix corner CI issue when the example path deleted
(#1634)
Signed-off-by: chensuyue
Signed-off-by: Chingis Yundunov
---
.github/workflows/scripts/get_test_matrix.sh | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/scripts/get_test_matrix.sh b/.github/workflows/scripts/get_test_matrix.sh
index 2d6efddd24..5ad6992104 100644
--- a/.github/workflows/scripts/get_test_matrix.sh
+++ b/.github/workflows/scripts/get_test_matrix.sh
@@ -12,6 +12,7 @@ run_matrix="{\"include\":["
examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u)
for example in ${examples}; do
+ if [[ ! -d $WORKSPACE/$example ]]; then continue; fi
cd $WORKSPACE/$example
if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi
cd tests
From 7b7824770c793e237fda7b34f3e13f6e3ba784de Mon Sep 17 00:00:00 2001
From: wangleflex <106506636+wangleflex@users.noreply.github.com>
Date: Fri, 7 Mar 2025 17:08:53 +0800
Subject: [PATCH 051/226] [ChatQnA] Show spinner after query to improve user
experience (#1003) (#1628)
Signed-off-by: Wang,Le3
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
.../shared/components/loading/Spinner.svelte | 68 +++++++++++++++++++
ChatQnA/ui/svelte/src/routes/+page.svelte | 11 ++-
2 files changed, 77 insertions(+), 2 deletions(-)
create mode 100644 ChatQnA/ui/svelte/src/lib/shared/components/loading/Spinner.svelte
diff --git a/ChatQnA/ui/svelte/src/lib/shared/components/loading/Spinner.svelte b/ChatQnA/ui/svelte/src/lib/shared/components/loading/Spinner.svelte
new file mode 100644
index 0000000000..1b0a086ad1
--- /dev/null
+++ b/ChatQnA/ui/svelte/src/lib/shared/components/loading/Spinner.svelte
@@ -0,0 +1,68 @@
+
+
+
+
+
+
+
+
+
diff --git a/ChatQnA/ui/svelte/src/routes/+page.svelte b/ChatQnA/ui/svelte/src/routes/+page.svelte
index b6f6d9c334..bcd0b8b708 100644
--- a/ChatQnA/ui/svelte/src/routes/+page.svelte
+++ b/ChatQnA/ui/svelte/src/routes/+page.svelte
@@ -39,6 +39,8 @@
import ChatMessage from "$lib/modules/chat/ChatMessage.svelte";
import { fetchAllFile } from "$lib/network/upload/Network.js";
import { getNotificationsContext } from "svelte-notifications";
+ import Spinner from "$lib/shared/components/loading/Spinner.svelte";
+
let query: string = "";
let loading: boolean = false;
@@ -241,8 +243,13 @@
type="submit"
id="send"
class="absolute bottom-2.5 end-2.5 px-4 py-2 text-sm font-medium text-white dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800"
- >
+ >
+ {#if loading}
+
+ {:else}
+
+ {/if}
+
From 6fbe02dc242f213cf123bbb04f213c3a1805d607 Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Fri, 7 Mar 2025 20:40:32 +0800
Subject: [PATCH 052/226] Use the latest HabanaAI/vllm-fork release tag to
build vllm-gaudi image (#1635)
Signed-off-by: chensuyue
Co-authored-by: Liang Lv
Signed-off-by: Chingis Yundunov
---
.github/workflows/_example-workflow.yml | 14 ++++++++------
AgentQnA/tests/step1_build_images.sh | 3 ++-
ChatQnA/tests/test_compose_guardrails_on_gaudi.sh | 4 +++-
ChatQnA/tests/test_compose_on_gaudi.sh | 4 +++-
.../tests/test_compose_without_rerank_on_gaudi.sh | 4 +++-
CodeTrans/tests/test_compose_on_gaudi.sh | 4 +++-
6 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml
index 010eece64a..f3b717a284 100644
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -78,16 +78,18 @@ jobs:
cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
- git clone https://github.com/vllm-project/vllm.git && cd vllm
+ git clone https://github.com/vllm-project/vllm.git && cd vllm
# Get the latest tag
- VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
echo "Check out vLLM tag ${VLLM_VER}"
- git checkout ${VLLM_VER} &> /dev/null
- # make sure do not change the pwd
- git rev-parse HEAD && cd ../
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
fi
if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
- git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
+ git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+ # Get the latest tag
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
fi
git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
cd GenAIComps && git rev-parse HEAD && cd ../
diff --git a/AgentQnA/tests/step1_build_images.sh b/AgentQnA/tests/step1_build_images.sh
index 4cb8a2e4d1..aa83521448 100644
--- a/AgentQnA/tests/step1_build_images.sh
+++ b/AgentQnA/tests/step1_build_images.sh
@@ -42,7 +42,8 @@ function build_vllm_docker_image() {
git clone https://github.com/HabanaAI/vllm-fork.git
fi
cd ./vllm-fork
- git checkout v0.6.4.post2+Gaudi-1.19.0
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ git checkout ${VLLM_VER} &> /dev/null
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
if [ $? -ne 0 ]; then
echo "opea/vllm-gaudi:ci failed"
diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
index d667a89f3c..c882a7ef77 100644
--- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
@@ -29,7 +29,9 @@ function build_docker_images() {
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
- git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
+ git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-guardrails chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh
index 8858900148..7f64e3b0d6 100644
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -29,7 +29,9 @@ function build_docker_images() {
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
- git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
+ git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi nginx"
diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
index 9e9d7df735..c9dc86a0bd 100644
--- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
@@ -29,7 +29,9 @@ function build_docker_images() {
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
- git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
+ git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-without-rerank chatqna-ui dataprep retriever vllm-gaudi nginx"
diff --git a/CodeTrans/tests/test_compose_on_gaudi.sh b/CodeTrans/tests/test_compose_on_gaudi.sh
index 9c78ea5972..39bf472521 100644
--- a/CodeTrans/tests/test_compose_on_gaudi.sh
+++ b/CodeTrans/tests/test_compose_on_gaudi.sh
@@ -30,7 +30,9 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
- git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
+ git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen vllm-gaudi nginx"
From 06dab2106ce1cae6f041561c1e55ef15dabf5b85 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Mon, 10 Mar 2025 09:39:35 +0800
Subject: [PATCH 053/226] Set vLLM as default model for FaqGen (#1580)
Signed-off-by: Xinyao Wang
Signed-off-by: Chingis Yundunov
---
.../docker_compose/intel/cpu/xeon/README.md | 92 +++++----
.../intel/cpu/xeon/compose.yaml | 21 +-
.../intel/cpu/xeon/compose_tgi.yaml | 78 +++++++
.../docker_compose/intel/hpu/gaudi/README.md | 66 +++---
.../intel/hpu/gaudi/compose.yaml | 34 ++--
.../intel/hpu/gaudi/compose_tgi.yaml | 94 +++++++++
FaqGen/tests/test_compose_on_gaudi.sh | 44 ++--
FaqGen/tests/test_compose_on_xeon.sh | 43 ++--
FaqGen/tests/test_compose_tgi_on_gaudi.sh | 192 ++++++++++++++++++
FaqGen/tests/test_compose_tgi_on_xeon.sh | 190 +++++++++++++++++
10 files changed, 726 insertions(+), 128 deletions(-)
create mode 100644 FaqGen/docker_compose/intel/cpu/xeon/compose_tgi.yaml
create mode 100644 FaqGen/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
create mode 100644 FaqGen/tests/test_compose_tgi_on_gaudi.sh
create mode 100755 FaqGen/tests/test_compose_tgi_on_xeon.sh
diff --git a/FaqGen/docker_compose/intel/cpu/xeon/README.md b/FaqGen/docker_compose/intel/cpu/xeon/README.md
index a961a6aa98..576c5724ec 100644
--- a/FaqGen/docker_compose/intel/cpu/xeon/README.md
+++ b/FaqGen/docker_compose/intel/cpu/xeon/README.md
@@ -14,7 +14,17 @@ After launching your instance, you can connect to it using SSH (for Linux instan
First of all, you need to build Docker Images locally. This step can be ignored once the Docker images are published to Docker hub.
-### 1. Build LLM Image
+### 1. Build vLLM Image
+
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd ./vllm/
+VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
+```
+
+### 2. Build LLM Image
```bash
git clone https://github.com/opea-project/GenAIComps.git
@@ -22,7 +32,7 @@ cd GenAIComps
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
```
-### 2. Build MegaService Docker Image
+### 3. Build MegaService Docker Image
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `faqgen.py` Python script. Build the MegaService Docker image via below command:
@@ -32,7 +42,7 @@ cd GenAIExamples/FaqGen/
docker build --no-cache -t opea/faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f GenAIExamples/FaqGen/Dockerfile .
```
-### 3. Build UI Docker Image
+### 4. Build UI Docker Image
Build the frontend Docker image via below command:
@@ -41,7 +51,7 @@ cd GenAIExamples/FaqGen/ui
docker build -t opea/faqgen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
```
-### 4. Build react UI Docker Image (Optional)
+### 5. Build react UI Docker Image (Optional)
Build the frontend Docker image based on react framework via below command:
@@ -53,10 +63,11 @@ docker build -t opea/faqgen-react-ui:latest --build-arg https_proxy=$https_proxy
Then run the command `docker images`, you will have the following Docker Images:
-1. `opea/llm-faqgen:latest`
-2. `opea/faqgen:latest`
-3. `opea/faqgen-ui:latest`
-4. `opea/faqgen-react-ui:latest`
+1. `opea/vllm:latest`
+2. `opea/llm-faqgen:latest`
+3. `opea/faqgen:latest`
+4. `opea/faqgen-ui:latest`
+5. `opea/faqgen-react-ui:latest`
## 🚀 Start Microservices and MegaService
@@ -77,7 +88,8 @@ export https_proxy=${your_http_proxy}
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export LLM_SERVICE_PORT=9000
-export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
+export FAQGEN_BACKEND_PORT=8888
+export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export MEGA_SERVICE_HOST_IP=${host_ip}
@@ -97,44 +109,44 @@ docker compose up -d
### Validate Microservices
-1. TGI Service
+1. vLLM Service
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
+```bash
+curl http://${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions \
+ -X POST \
+ -H "Content-Type: application/json" \
+ -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
+```
2. LLM Microservice
- ```bash
- curl http://${host_ip}:9000/v1/faqgen \
- -X POST \
- -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
- -H 'Content-Type: application/json'
- ```
+```bash
+curl http://${host_ip}:${LLM_SERVICE_PORT}/v1/faqgen \
+ -X POST \
+ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+ -H 'Content-Type: application/json'
+```
3. MegaService
- ```bash
- curl http://${host_ip}:8888/v1/faqgen \
- -H "Content-Type: multipart/form-data" \
- -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
- -F "max_tokens=32" \
- -F "stream=False"
- ```
-
- ```bash
- ## enable stream
- curl http://${host_ip}:8888/v1/faqgen \
- -H "Content-Type: multipart/form-data" \
- -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
- -F "max_tokens=32" \
- -F "stream=True"
- ```
-
- Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service.
+```bash
+curl http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen \
+ -H "Content-Type: multipart/form-data" \
+ -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
+ -F "max_tokens=32" \
+ -F "stream=False"
+```
+
+```bash
+## enable stream
+curl http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen \
+ -H "Content-Type: multipart/form-data" \
+ -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
+ -F "max_tokens=32" \
+ -F "stream=True"
+```
+
+Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service.
## 🚀 Launch the UI
diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
index ca86a18f2d..7da122f9ab 100644
--- a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
services:
- tgi-service:
- image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
- container_name: tgi-xeon-server
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ container_name: vllm-service
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
@@ -14,20 +14,23 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${HF_TOKEN}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "${VLLM_TORCH_PROFILER_DIR:-/mnt}"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+ VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
- command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm_faqgen:
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
container_name: llm-faqgen-server
depends_on:
- tgi-service:
+ vllm-service:
condition: service_healthy
ports:
- ${LLM_SERVICE_PORT:-9000}:9000
@@ -39,17 +42,17 @@ services:
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
+ FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenvLLM}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
faqgen-xeon-backend-server:
image: ${REGISTRY:-opea}/faqgen:${TAG:-latest}
container_name: faqgen-xeon-backend-server
depends_on:
- - tgi-service
+ - vllm-service
- llm_faqgen
ports:
- - "8888:8888"
+ - ${FAQGEN_BACKEND_PORT:-8888}:8888
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose_tgi.yaml
new file mode 100644
index 0000000000..b900331ad8
--- /dev/null
+++ b/FaqGen/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -0,0 +1,78 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ tgi-service:
+ image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+ container_name: tgi-xeon-server
+ ports:
+ - ${LLM_ENDPOINT_PORT:-8008}:80
+ volumes:
+ - "${MODEL_CACHE:-./data}:/data"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ host_ip: ${host_ip}
+ LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ llm_faqgen:
+ image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
+ container_name: llm-faqgen-server
+ depends_on:
+ tgi-service:
+ condition: service_healthy
+ ports:
+ - ${LLM_SERVICE_PORT:-9000}:9000
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenTgi}
+ LOGFLAG: ${LOGFLAG:-False}
+ restart: unless-stopped
+ faqgen-xeon-backend-server:
+ image: ${REGISTRY:-opea}/faqgen:${TAG:-latest}
+ container_name: faqgen-xeon-backend-server
+ depends_on:
+ - tgi-service
+ - llm_faqgen
+ ports:
+ - ${FAQGEN_BACKEND_PORT:-8888}:8888
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+ - LLM_SERVICE_PORT=${LLM_SERVICE_PORT}
+ ipc: host
+ restart: always
+ faqgen-xeon-ui-server:
+ image: ${REGISTRY:-opea}/faqgen-ui:${TAG:-latest}
+ container_name: faqgen-xeon-ui-server
+ depends_on:
+ - faqgen-xeon-backend-server
+ ports:
+ - "5173:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - FAQ_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+networks:
+ default:
+ driver: bridge
diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/README.md b/FaqGen/docker_compose/intel/hpu/gaudi/README.md
index 7364e92387..4f8793d4ac 100644
--- a/FaqGen/docker_compose/intel/hpu/gaudi/README.md
+++ b/FaqGen/docker_compose/intel/hpu/gaudi/README.md
@@ -6,7 +6,7 @@ This document outlines the deployment process for a FAQ Generation application u
1. Set up the environment variables.
2. Run Docker Compose.
-3. Consume the ChatQnA Service.
+3. Consume the FaqGen Service.
### Quick Start: 1.Setup Environment Variable
@@ -32,12 +32,14 @@ To set up environment variables for deploying ChatQnA services, follow these ste
3. Set up other environment variables:
```bash
- export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
- export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
+ export LLM_ENDPOINT_PORT=8008
export LLM_SERVICE_PORT=9000
- export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/faqgen"
+ export FAQGEN_BACKEND_PORT=8888
+ export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+ export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
```
### Quick Start: 2.Run Docker Compose
@@ -50,7 +52,7 @@ It will automatically download the docker image on `docker hub`, please check th
```bash
docker ps -a
- docker logs tgi-gaudi-server -t
+ docker logs vllm-gaudi-service -t
```
it may take some time to download the model.
@@ -65,32 +67,33 @@ Please refer to 'Build Docker Images' in below.
### QuickStart: 3.Consume the Service
```bash
-curl localhost:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
+curl http://localhost:${LLM_ENDPOINT_PORT}/v1/chat/completions \
+ -X POST \
+ -H "Content-Type: application/json" \
+ -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```
here we just test the service on the host machine for a quick start.
If all networks work fine, please try
```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
+curl http://${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions \
+ -X POST \
+ -H "Content-Type: application/json" \
+ -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```
## 🚀 Build Docker Images
First of all, you need to build Docker Images locally. This step can be ignored once the Docker images are published to Docker hub.
-### 1. Pull TGI Gaudi Image
-
-As TGI Gaudi has been officially published as a Docker image, we simply need to pull it:
+### 1. Build vLLM Image
```bash
-docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork/
+git checkout v0.6.4.post2+Gaudi-1.19.0
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .
```
### 2. Build LLM Image
@@ -126,13 +129,13 @@ Build the frontend Docker image based on react framework via below command:
```bash
cd GenAIExamples/FaqGen/ui
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/faqgen"
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
docker build -t opea/faqgen-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f docker/Dockerfile.react .
```
Then run the command `docker images`, you will have the following Docker Images:
-1. `ghcr.io/huggingface/tgi-gaudi:2.0.6`
+1. `opea/vllm-gaudi:latest`
2. `opea/llm-faqgen:latest`
3. `opea/faqgen:latest`
4. `opea/faqgen-ui:latest`
@@ -157,13 +160,14 @@ export https_proxy=${your_http_proxy}
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export LLM_SERVICE_PORT=9000
-export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
+export FAQGEN_BACKEND_PORT=8888
+export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/faqgen"
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
```
Note: Please replace with `host_ip` with your external IP address, do not use localhost.
@@ -177,19 +181,19 @@ docker compose up -d
### Validate Microservices
-1. TGI Service
+1. vLLM Service
- ```bash
- curl http://${host_ip}:8008/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
- -H 'Content-Type: application/json'
- ```
+ ```bash
+ curl http://${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions \
+ -X POST \
+ -H "Content-Type: application/json" \
+ -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
+ ```
2. LLM Microservice
```bash
- curl http://${host_ip}:9000/v1/faqgen \
+ curl http://${host_ip}:${LLM_SERVICE_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
@@ -198,7 +202,7 @@ docker compose up -d
3. MegaService
```bash
- curl http://${host_ip}:8888/v1/faqgen \
+ curl http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen \
-H "Content-Type: multipart/form-data" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
@@ -207,7 +211,7 @@ docker compose up -d
```bash
##enable stream
- curl http://${host_ip}:8888/v1/faqgen \
+ curl http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen \
-H "Content-Type: multipart/form-data" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml
index 90503069c1..fbc8812d58 100644
--- a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -2,30 +2,26 @@
# SPDX-License-Identifier: Apache-2.0
services:
- tgi-service:
- image: ghcr.io/huggingface/tgi-gaudi:2.3.1
- container_name: tgi-gaudi-server
+ vllm-gaudi-service:
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+ container_name: vllm-gaudi-service
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "${DATA_PATH:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- HF_HUB_DISABLE_PROGRESS_BARS: 1
- HF_HUB_ENABLE_HF_TRANSFER: 0
+ HF_TOKEN: ${HF_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
- PREFILL_BATCH_BUCKET_SIZE: 1
- BATCH_BUCKET_SIZE: 8
- ENABLE_HPU_GRAPH: true
- LIMIT_HPU_GRAPH: true
- USE_FLASH_ATTENTION: true
- FLASH_ATTENTION_RECOMPUTE: true
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+ VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
+ NUM_CARDS: ${NUM_CARDS:-1}
runtime: habana
cap_add:
- SYS_NICE
@@ -34,13 +30,13 @@ services:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
- retries: 100
- command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 4096
+ retries: 150
+ command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256
llm_faqgen:
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
container_name: llm-faqgen-server
depends_on:
- tgi-service:
+ vllm-gaudi-service:
condition: service_healthy
ports:
- ${LLM_SERVICE_PORT:-9000}:9000
@@ -52,17 +48,17 @@ services:
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
+ FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenvLLM}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
faqgen-gaudi-backend-server:
image: ${REGISTRY:-opea}/faqgen:${TAG:-latest}
container_name: faqgen-gaudi-backend-server
depends_on:
- - tgi-service
+ - vllm-gaudi-service
- llm_faqgen
ports:
- - "8888:8888"
+ - ${FAQGEN_BACKEND_PORT:-8888}:8888
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/FaqGen/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
new file mode 100644
index 0000000000..082321583b
--- /dev/null
+++ b/FaqGen/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -0,0 +1,94 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ tgi-service:
+ image: ghcr.io/huggingface/tgi-gaudi:2.3.1
+ container_name: tgi-gaudi-server
+ ports:
+ - ${LLM_ENDPOINT_PORT:-8008}:80
+ volumes:
+ - "${MODEL_CACHE:-./data}:/data"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ PREFILL_BATCH_BUCKET_SIZE: 1
+ BATCH_BUCKET_SIZE: 8
+ ENABLE_HPU_GRAPH: true
+ LIMIT_HPU_GRAPH: true
+ USE_FLASH_ATTENTION: true
+ FLASH_ATTENTION_RECOMPUTE: true
+ host_ip: ${host_ip}
+ LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+ MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS:-4096}
+ MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS:-8192}
+ runtime: habana
+ cap_add:
+ - SYS_NICE
+ ipc: host
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model-id ${LLM_MODEL_ID} --max-input-tokens ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} --max-batch-total-tokens 65536 --max-batch-prefill-tokens 4096
+ llm_faqgen:
+ image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
+ container_name: llm-faqgen-server
+ depends_on:
+ tgi-service:
+ condition: service_healthy
+ ports:
+ - ${LLM_SERVICE_PORT:-9000}:9000
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenTgi}
+ LOGFLAG: ${LOGFLAG:-False}
+ restart: unless-stopped
+ faqgen-gaudi-backend-server:
+ image: ${REGISTRY:-opea}/faqgen:${TAG:-latest}
+ container_name: faqgen-gaudi-backend-server
+ depends_on:
+ - tgi-service
+ - llm_faqgen
+ ports:
+ - ${FAQGEN_BACKEND_PORT:-8888}:8888
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+ - LLM_SERVICE_PORT=${LLM_SERVICE_PORT}
+ ipc: host
+ restart: always
+ faqgen-gaudi-ui-server:
+ image: ${REGISTRY:-opea}/faqgen-ui:${TAG:-latest}
+ container_name: faqgen-gaudi-ui-server
+ depends_on:
+ - faqgen-gaudi-backend-server
+ ports:
+ - "5173:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - FAQ_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/FaqGen/tests/test_compose_on_gaudi.sh b/FaqGen/tests/test_compose_on_gaudi.sh
index eeba304279..125e71a8e4 100644
--- a/FaqGen/tests/test_compose_on_gaudi.sh
+++ b/FaqGen/tests/test_compose_on_gaudi.sh
@@ -13,9 +13,21 @@ export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
-export DATA_PATH=${model_cache:-"/data/cache"}
+export MODEL_CACHE=${model_cache:-"/data/cache"}
function build_docker_images() {
+ cd $WORKPATH
+ git clone https://github.com/HabanaAI/vllm-fork.git
+ cd vllm-fork/
+ git checkout v0.6.4.post2+Gaudi-1.19.0
+ docker build --no-cache -f Dockerfile.hpu -t ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} --shm-size=128g .
+ if [ $? -ne 0 ]; then
+ echo "opea/vllm-gaudi built fail"
+ exit 1
+ else
+ echo "opea/vllm-gaudi built successful"
+ fi
+
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
@@ -35,7 +47,6 @@ function build_docker_images() {
service_list="faqgen faqgen-ui llm-faqgen"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
@@ -43,15 +54,17 @@ function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export host_ip=${ip_address}
- export LLM_ENDPOINT_PORT=8008
- export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
- export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+ export LLM_ENDPOINT_PORT=8010
+ export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
+ export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
- export LLM_SERVICE_PORT=9000
- export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/faqgen"
+ export LLM_SERVICE_PORT=9001
+ export FAQGEN_BACKEND_PORT=8888
+ export NUM_CARDS=1
+ export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
export LOGFLAG=True
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
@@ -93,17 +106,18 @@ function validate_services() {
function validate_microservices() {
# Check if the microservices are running correctly.
- # tgi for llm service
+ # vllm
+ echo "Validate vllm..."
validate_services \
- "${ip_address}:8008/generate" \
- "generated_text" \
- "tgi-service" \
- "tgi-gaudi-server" \
- '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+ "http://${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
+ "text" \
+ "vllm-gaudi-service" \
+ "vllm-gaudi-service" \
+ '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
# llm microservice
validate_services \
- "${ip_address}:9000/v1/faqgen" \
+ "${ip_address}:${LLM_SERVICE_PORT}/v1/faqgen" \
"text" \
"llm" \
"llm-faqgen-server" \
@@ -115,7 +129,7 @@ function validate_megaservice() {
local DOCKER_NAME="faqgen-gaudi-backend-server"
local EXPECTED_RESULT="Embeddings"
local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${ip_address}:8888/v1/faqgen"
+ local URL="${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL")
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
diff --git a/FaqGen/tests/test_compose_on_xeon.sh b/FaqGen/tests/test_compose_on_xeon.sh
index cc527b7e9d..fb859ebe04 100755
--- a/FaqGen/tests/test_compose_on_xeon.sh
+++ b/FaqGen/tests/test_compose_on_xeon.sh
@@ -16,6 +16,20 @@ LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
+ cd $WORKPATH
+ git clone https://github.com/vllm-project/vllm.git
+ cd ./vllm/
+ VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null
+ docker build --no-cache -f Dockerfile.cpu -t ${REGISTRY:-opea}/vllm:${TAG:-latest} --shm-size=128g .
+ if [ $? -ne 0 ]; then
+ echo "opea/vllm built fail"
+ exit 1
+ else
+ echo "opea/vllm built successful"
+ fi
+
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
@@ -35,7 +49,6 @@ function build_docker_images() {
service_list="faqgen faqgen-ui llm-faqgen"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker images && sleep 1s
}
@@ -43,15 +56,16 @@ function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export host_ip=${ip_address}
- export LLM_ENDPOINT_PORT=8008
- export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
- export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+ export LLM_ENDPOINT_PORT=8011
+ export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
+ export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
- export LLM_SERVICE_PORT=9000
- export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/faqgen"
+ export LLM_SERVICE_PORT=9002
+ export FAQGEN_BACKEND_PORT=8888
+ export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
export LOGFLAG=True
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
@@ -93,17 +107,18 @@ function validate_services() {
function validate_microservices() {
# Check if the microservices are running correctly.
- # tgi for llm service
+ # vllm for llm service
+ echo "Validate vllm..."
validate_services \
- "${ip_address}:8008/generate" \
- "generated_text" \
- "tgi-service" \
- "tgi-xeon-server" \
- '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+ "http://${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
+ "text" \
+ "vllm-service" \
+ "vllm-service" \
+ '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
# llm microservice
validate_services \
- "${ip_address}:9000/v1/faqgen" \
+ "${ip_address}:${LLM_SERVICE_PORT}/v1/faqgen" \
"text" \
"llm" \
"llm-faqgen-server" \
@@ -115,7 +130,7 @@ function validate_megaservice() {
local DOCKER_NAME="faqgen-xeon-backend-server"
local EXPECTED_RESULT="Embeddings"
local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${ip_address}:8888/v1/faqgen"
+ local URL="${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL")
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
diff --git a/FaqGen/tests/test_compose_tgi_on_gaudi.sh b/FaqGen/tests/test_compose_tgi_on_gaudi.sh
new file mode 100644
index 0000000000..1c596322b4
--- /dev/null
+++ b/FaqGen/tests/test_compose_tgi_on_gaudi.sh
@@ -0,0 +1,192 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MODEL_CACHE=${model_cache:-"/data/cache"}
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="faqgen faqgen-ui llm-faqgen"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/hpu/gaudi
+
+ export host_ip=${ip_address}
+ export LLM_ENDPOINT_PORT=8009
+ export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
+ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export MEGA_SERVICE_HOST_IP=${ip_address}
+ export LLM_SERVICE_HOST_IP=${ip_address}
+ export LLM_SERVICE_PORT=9001
+ export MAX_INPUT_TOKENS=4096
+ export MAX_TOTAL_TOKENS=8192
+ export FAQGEN_BACKEND_PORT=8889
+ export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
+ export LOGFLAG=True
+
+ sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+ # Start Docker Containers
+ docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+ sleep 30s
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # tgi for llm service
+ validate_services \
+ "${ip_address}:${LLM_ENDPOINT_PORT}/generate" \
+ "generated_text" \
+ "tgi-service" \
+ "tgi-gaudi-server" \
+ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
+ # llm microservice
+ validate_services \
+ "${ip_address}:${LLM_SERVICE_PORT}/v1/faqgen" \
+ "text" \
+ "llm" \
+ "llm-faqgen-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="mega-faqgen"
+ local DOCKER_NAME="faqgen-gaudi-backend-server"
+ local EXPECTED_RESULT="Embeddings"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL")
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_frontend() {
+ cd $WORKPATH/ui/svelte
+ local conda_env_name="OPEA_e2e"
+ export PATH=${HOME}/miniforge3/bin/:$PATH
+ if conda info --envs | grep -q "$conda_env_name"; then
+ echo "$conda_env_name exist!"
+ else
+ conda create -n ${conda_env_name} python=3.12 -y
+ fi
+ source activate ${conda_env_name}
+
+ sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+ conda install -c conda-forge nodejs=22.6.0 -y
+ npm install && npm ci && npx playwright install --with-deps
+ node -v && npm -v && pip list
+
+ exit_status=0
+ npx playwright test || exit_status=$?
+
+ if [ $exit_status -ne 0 ]; then
+ echo "[TEST INFO]: ---------frontend test failed---------"
+ exit $exit_status
+ else
+ echo "[TEST INFO]: ---------frontend test passed---------"
+ fi
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/intel/hpu/gaudi
+ docker compose -f compose_tgi.yaml stop && docker compose -f compose_tgi.yaml rm -f
+}
+
+function main() {
+
+ stop_docker
+
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ start_services
+
+ validate_microservices
+ validate_megaservice
+ # validate_frontend
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
diff --git a/FaqGen/tests/test_compose_tgi_on_xeon.sh b/FaqGen/tests/test_compose_tgi_on_xeon.sh
new file mode 100755
index 0000000000..9676288a63
--- /dev/null
+++ b/FaqGen/tests/test_compose_tgi_on_xeon.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+export MODEL_CACHE=${model_cache:-"./data"}
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="faqgen faqgen-ui llm-faqgen"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/text-generation-inference:1.4
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+
+ export host_ip=${ip_address}
+ export LLM_ENDPOINT_PORT=8009
+ export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
+ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export MEGA_SERVICE_HOST_IP=${ip_address}
+ export LLM_SERVICE_HOST_IP=${ip_address}
+ export LLM_SERVICE_PORT=9001
+ export FAQGEN_BACKEND_PORT=8889
+ export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
+ export LOGFLAG=True
+
+ sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+ # Start Docker Containers
+ docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+ sleep 30s
+}
+
+function validate_services() {
+ local URL="$1"
+ local EXPECTED_RESULT="$2"
+ local SERVICE_NAME="$3"
+ local DOCKER_NAME="$4"
+ local INPUT_DATA="$5"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_microservices() {
+ # Check if the microservices are running correctly.
+
+ # tgi for llm service
+ validate_services \
+ "${ip_address}:${LLM_ENDPOINT_PORT}/generate" \
+ "generated_text" \
+ "tgi-service" \
+ "tgi-xeon-server" \
+ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
+ # llm microservice
+ validate_services \
+ "${ip_address}:${LLM_SERVICE_PORT}/v1/faqgen" \
+ "text" \
+ "llm" \
+ "llm-faqgen-server" \
+ '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+}
+
+function validate_megaservice() {
+ local SERVICE_NAME="mega-faqgen"
+ local DOCKER_NAME="faqgen-xeon-backend-server"
+ local EXPECTED_RESULT="Embeddings"
+ local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+ local URL="${ip_address}:${FAQGEN_BACKEND_PORT}/v1/faqgen"
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL")
+ if [ "$HTTP_STATUS" -eq 200 ]; then
+ echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+
+ local CONTENT=$(curl -s -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+
+ if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+ echo "[ $SERVICE_NAME ] Content is as expected."
+ else
+ echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ else
+ echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+ docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+ exit 1
+ fi
+ sleep 1s
+}
+
+function validate_frontend() {
+ cd $WORKPATH/ui/svelte
+ local conda_env_name="OPEA_e2e"
+ export PATH=${HOME}/miniforge3/bin/:$PATH
+ if conda info --envs | grep -q "$conda_env_name"; then
+ echo "$conda_env_name exists!"
+ else
+ conda create -n ${conda_env_name} python=3.12 -y
+ fi
+ source activate ${conda_env_name}
+
+ sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+ conda install -c conda-forge nodejs=22.6.0 -y
+ npm install && npm ci && npx playwright install --with-deps
+ node -v && npm -v && pip list
+
+ exit_status=0
+ npx playwright test || exit_status=$?
+
+ if [ $exit_status -ne 0 ]; then
+ echo "[TEST INFO]: ---------frontend test failed---------"
+ exit $exit_status
+ else
+ echo "[TEST INFO]: ---------frontend test passed---------"
+ fi
+}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ docker compose -f compose_tgi.yaml stop && docker compose -f compose_tgi.yaml rm -f
+}
+
+function main() {
+
+ stop_docker
+
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ start_services
+
+ validate_microservices
+ validate_megaservice
+ # validate_frontend
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
From a6d6f1f6c02005578e544fd76f0c480459f79bcd Mon Sep 17 00:00:00 2001
From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com>
Date: Mon, 10 Mar 2025 13:40:42 +0800
Subject: [PATCH 054/226] Fix vllm model cache directory (#1642)
Signed-off-by: Wang, Kai Lawrence
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/intel/cpu/xeon/README.md | 4 ++--
ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md | 4 ++--
ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml | 2 +-
ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml | 2 +-
.../docker_compose/intel/cpu/xeon/compose_without_rerank.yaml | 2 +-
CodeTrans/docker_compose/intel/cpu/xeon/README.md | 4 ++--
CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
CodeTrans/docker_compose/intel/hpu/gaudi/README.md | 4 ++--
CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
FaqGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +-
FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +-
12 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index c71a866cf5..f8475e94d0 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -219,7 +219,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_ENDPOINT="https://hf-mirror.com"
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
# Start vLLM LLM Service
- docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
@@ -236,7 +236,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
- docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
index 8e8a9cd441..4d127be19a 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
@@ -201,7 +201,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
- docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
```
2. Offline
@@ -215,7 +215,7 @@ For users in China who are unable to download models directly from Huggingface,
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
- docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
```
### Setup Environment Variables
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 1ec229115e..2427e3e1c3 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
index 740f5eba42..7025c9018a 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
@@ -144,7 +144,7 @@ services:
ports:
- "9009:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
index 0504ff07a1..40e1992d75 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
@@ -80,7 +80,7 @@ services:
ports:
- "6042:80"
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
index 70ea084408..e121e77b6b 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -64,7 +64,7 @@ services:
ports:
- "9009:80"
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/README.md b/CodeTrans/docker_compose/intel/cpu/xeon/README.md
index a7a8066202..3d250c7036 100755
--- a/CodeTrans/docker_compose/intel/cpu/xeon/README.md
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/README.md
@@ -74,7 +74,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
- docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
@@ -91,7 +91,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
- docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
index 24c8bfdd39..f4aa9f2b95 100644
--- a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md
index cf5f2d3c11..d07326598f 100755
--- a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md
@@ -66,7 +66,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
- docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
@@ -83,7 +83,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
- docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
+ docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
index 2caeaf0ec3..7fe0538f60 100644
--- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
index 7da122f9ab..b122f43157 100644
--- a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml
index fbc8812d58..80d4cd8438 100644
--- a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,7 +8,7 @@ services:
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
From cb831b0cc22a9ebfb248ec46fb309ecfd2e9571f Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Mon, 10 Mar 2025 17:36:26 +0800
Subject: [PATCH 055/226] Enhance ChatQnA test scripts (#1643)
Signed-off-by: chensuyue
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml | 4 ++--
ChatQnA/tests/test_compose_guardrails_on_gaudi.sh | 2 +-
ChatQnA/tests/test_compose_milvus_on_xeon.sh | 5 ++++-
ChatQnA/tests/test_compose_qdrant_on_xeon.sh | 2 +-
ChatQnA/tests/test_compose_tgi_on_gaudi.sh | 2 +-
ChatQnA/tests/test_compose_tgi_on_xeon.sh | 2 +-
ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh | 2 +-
ChatQnA/tests/test_compose_without_rerank_on_xeon.sh | 2 +-
8 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
index 7025c9018a..13306b1bf2 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
@@ -113,7 +113,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -127,7 +127,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
index c882a7ef77..855b986af7 100644
--- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
@@ -162,7 +162,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"mega-chatqna" \
"chatqna-gaudi-guardrails-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
diff --git a/ChatQnA/tests/test_compose_milvus_on_xeon.sh b/ChatQnA/tests/test_compose_milvus_on_xeon.sh
index d2953a9992..0a8814954a 100644
--- a/ChatQnA/tests/test_compose_milvus_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_milvus_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -180,7 +181,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"chatqna-megaservice" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
@@ -240,6 +241,8 @@ function main() {
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
+ validate_frontend
+ echo "==== frontend validated ===="
stop_docker
echo y | docker system prune
diff --git a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh
index 8c84a9a9ff..fe66abaf12 100644
--- a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh
@@ -162,7 +162,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8912/v1/chatqna" \
- "data: " \
+ "Nike" \
"mega-chatqna" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
diff --git a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
index 25bfe8cdee..483df8ef97 100644
--- a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
@@ -182,7 +182,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"chatqna-megaservice" \
"chatqna-gaudi-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
diff --git a/ChatQnA/tests/test_compose_tgi_on_xeon.sh b/ChatQnA/tests/test_compose_tgi_on_xeon.sh
index f00d8c6436..1f871a38f6 100644
--- a/ChatQnA/tests/test_compose_tgi_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_tgi_on_xeon.sh
@@ -181,7 +181,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"chatqna-megaservice" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
index c9dc86a0bd..bc60054291 100644
--- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
@@ -171,7 +171,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"chatqna-megaservice" \
"chatqna-gaudi-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
index 279bc780d0..66c6fe420e 100644
--- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
@@ -174,7 +174,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
- "data: " \
+ "Nike" \
"chatqna-megaservice" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
From ffa0eadb7e76d56c8705d4dd198e4fc1f89f9f6c Mon Sep 17 00:00:00 2001
From: "Sun, Xuehao"
Date: Wed, 12 Mar 2025 10:56:07 +0800
Subject: [PATCH 056/226] Add GitHub Action to check and close stale issues and
PRs (#1646)
Signed-off-by: Sun, Xuehao
Signed-off-by: Chingis Yundunov
---
.../workflows/daily_check_issue_and_pr.yml | 29 +++++++++++++++++++
1 file changed, 29 insertions(+)
create mode 100644 .github/workflows/daily_check_issue_and_pr.yml
diff --git a/.github/workflows/daily_check_issue_and_pr.yml b/.github/workflows/daily_check_issue_and_pr.yml
new file mode 100644
index 0000000000..b578580602
--- /dev/null
+++ b/.github/workflows/daily_check_issue_and_pr.yml
@@ -0,0 +1,29 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+name: Check stale issue and pr
+
+on:
+ schedule:
+ - cron: "30 22 * * *"
+
+jobs:
+ close-issues:
+ runs-on: ubuntu-latest
+ permissions:
+ issues: write
+ pull-requests: write
+ steps:
+ - uses: actions/stale@v9
+ with:
+ days-before-issue-stale: 60
+ days-before-pr-stale: 60
+ days-before-issue-close: 7
+ days-before-pr-close: 7
+ stale-issue-message: "This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days."
+ stale-pr-message: "This PR is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days."
+ close-issue-message: "This issue was closed because it has been stalled for 7 days with no activity."
+ close-pr-message: "This PR was closed because it has been stalled for 7 days with no activity."
+ repo-token: ${{ secrets.ACTION_TOKEN }}
+ start-date: "2025-01-01T00:00:00Z"
+ debug-only: true # will remove this line when ready to merge
From b725c267380aab8c8bcd99baf8298d31f7dbd59e Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Thu, 13 Mar 2025 02:23:07 +0200
Subject: [PATCH 057/226] Use GenAIComp base image to simplify Dockerfiles &
reduce image sizes - part 2 (#1638)
Signed-off-by: Eero Tamminen
Signed-off-by: Chingis Yundunov
---
ChatQnA/Dockerfile | 44 ++-----------------------------
ChatQnA/Dockerfile.guardrails | 44 ++-----------------------------
ChatQnA/Dockerfile.without_rerank | 44 ++-----------------------------
DocSum/Dockerfile | 44 ++-----------------------------
GraphRAG/Dockerfile | 44 ++-----------------------------
SearchQnA/Dockerfile | 44 ++-----------------------------
Translation/Dockerfile | 44 ++-----------------------------
VisualQnA/Dockerfile | 44 ++-----------------------------
8 files changed, 16 insertions(+), 336 deletions(-)
diff --git a/ChatQnA/Dockerfile b/ChatQnA/Dockerfile
index fb7f5e14ec..fffb8d8970 100644
--- a/ChatQnA/Dockerfile
+++ b/ChatQnA/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./chatqna.py $HOME/chatqna.py
diff --git a/ChatQnA/Dockerfile.guardrails b/ChatQnA/Dockerfile.guardrails
index 4fe5fd2087..07a358d922 100644
--- a/ChatQnA/Dockerfile.guardrails
+++ b/ChatQnA/Dockerfile.guardrails
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./chatqna.py $HOME/chatqna.py
diff --git a/ChatQnA/Dockerfile.without_rerank b/ChatQnA/Dockerfile.without_rerank
index 9e6740e9b8..ad1611110a 100644
--- a/ChatQnA/Dockerfile.without_rerank
+++ b/ChatQnA/Dockerfile.without_rerank
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./chatqna.py $HOME/chatqna.py
diff --git a/DocSum/Dockerfile b/DocSum/Dockerfile
index fd01f3bca0..2cc8c3d5a5 100644
--- a/DocSum/Dockerfile
+++ b/DocSum/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
USER root
# FFmpeg needed for media processing
diff --git a/GraphRAG/Dockerfile b/GraphRAG/Dockerfile
index 1e50649dd5..0c2c91d85f 100644
--- a/GraphRAG/Dockerfile
+++ b/GraphRAG/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./graphrag.py $HOME/graphrag.py
diff --git a/SearchQnA/Dockerfile b/SearchQnA/Dockerfile
index df8d536b08..a93afd6093 100644
--- a/SearchQnA/Dockerfile
+++ b/SearchQnA/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./searchqna.py $HOME/searchqna.py
diff --git a/Translation/Dockerfile b/Translation/Dockerfile
index 70266c9b87..853935af84 100644
--- a/Translation/Dockerfile
+++ b/Translation/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./translation.py $HOME/translation.py
diff --git a/VisualQnA/Dockerfile b/VisualQnA/Dockerfile
index 257b39df89..95936d9c03 100644
--- a/VisualQnA/Dockerfile
+++ b/VisualQnA/Dockerfile
@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Stage 1: base setup used by other stages
-FROM python:3.11-slim AS base
-
-# get security updates
-RUN apt-get update && apt-get upgrade -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-ENV HOME=/home/user
-
-RUN useradd -m -s /bin/bash user && \
- mkdir -p $HOME && \
- chown -R user $HOME
-
-WORKDIR $HOME
-
-
-# Stage 2: latest GenAIComps sources
-FROM base AS git
-
-RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
-
-# Stage 3: common layer shared by services using GenAIComps
-FROM base AS comps-base
-
-# copy just relevant parts
-COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
-COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
-
-WORKDIR $HOME/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
- pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
-WORKDIR $HOME
-
-ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
-
-USER user
-
-
-# Stage 4: unique part
-FROM comps-base
+ARG BASE_TAG=latest
+FROM opea/comps-base:$BASE_TAG
COPY ./visualqna.py $HOME/visualqna.py
From faf8f09e87bf9a6b064c689de3e2a0c08568cf1d Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Thu, 13 Mar 2025 09:39:42 +0800
Subject: [PATCH 058/226] Enable inject_commit to docker image feature. (#1653)
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
.github/workflows/nightly-docker-build-publish.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/nightly-docker-build-publish.yml b/.github/workflows/nightly-docker-build-publish.yml
index 84e1fe88bc..3a05fae1df 100644
--- a/.github/workflows/nightly-docker-build-publish.yml
+++ b/.github/workflows/nightly-docker-build-publish.yml
@@ -44,6 +44,7 @@ jobs:
node: gaudi
example: ${{ matrix.example }}
test_compose: true
+ inject_commit: true
secrets: inherit
get-image-list:
From 6e262af6ce165684e9a1baf540c50ec487e729e3 Mon Sep 17 00:00:00 2001
From: xiguiw <111278656+xiguiw@users.noreply.github.com>
Date: Thu, 13 Mar 2025 10:38:47 +0800
Subject: [PATCH 059/226] Enable CodeGen vLLM (#1636)
Signed-off-by: Wang, Xigui
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
.../docker_compose/intel/cpu/xeon/README.md | 152 ++++++++++--------
.../intel/cpu/xeon/compose.yaml | 60 +++++--
.../docker_compose/intel/hpu/gaudi/README.md | 152 ++++++++++--------
.../intel/hpu/gaudi/compose.yaml | 66 ++++++--
CodeGen/docker_compose/set_env.sh | 14 +-
CodeGen/docker_image_build/build.yaml | 12 ++
CodeGen/tests/test_compose_on_gaudi.sh | 78 ++++++---
CodeGen/tests/test_compose_on_xeon.sh | 79 ++++++---
8 files changed, 419 insertions(+), 194 deletions(-)
diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md
index 01ee5d1fa4..3cc7a19b3c 100644
--- a/CodeGen/docker_compose/intel/cpu/xeon/README.md
+++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md
@@ -1,6 +1,7 @@
# Build MegaService of CodeGen on Xeon
This document outlines the deployment process for a CodeGen application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker images creation, container deployment via Docker Compose, and service execution to integrate microservices such as `llm`. We will publish the Docker images to Docker Hub soon, further simplifying the deployment process for this service.
+The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice.
## 🚀 Create an AWS Xeon Instance
@@ -10,55 +11,6 @@ For detailed information about these instance types, you can refer to [m7i](http
After launching your instance, you can connect to it using SSH (for Linux instances) or Remote Desktop Protocol (RDP) (for Windows instances). From there, you'll have full access to your Xeon server, allowing you to install, configure, and manage your applications as needed.
-## 🚀 Download or Build Docker Images
-
-Should the Docker image you seek not yet be available on Docker Hub, you can build the Docker image locally.
-
-### 1. Build the LLM Docker Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
-```
-
-### 2. Build the MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `codegen.py` Python script. Build MegaService Docker image via the command below:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/CodeGen
-docker build -t opea/codegen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build the UI Docker Image
-
-Build the frontend Docker image via the command below:
-
-```bash
-cd GenAIExamples/CodeGen/ui
-docker build -t opea/codegen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
-```
-
-### 4. Build CodeGen React UI Docker Image (Optional)
-
-Build react frontend Docker image via below command:
-
-**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
-
-```bash
-cd GenAIExamples/CodeGen/ui
-docker build --no-cache -t opea/codegen-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-- `opea/llm-textgen:latest`
-- `opea/codegen:latest`
-- `opea/codegen-ui:latest`
-- `opea/codegen-react-ui:latest` (optional)
-
## 🚀 Start Microservices and MegaService
The CodeGen megaservice manages a single microservice called LLM within a Directed Acyclic Graph (DAG). In the diagram above, the LLM microservice is a language model microservice that generates code snippets based on the user's input query. The TGI service serves as a text generation interface, providing a RESTful API for the LLM microservice. The CodeGen Gateway acts as the entry point for the CodeGen application, invoking the Megaservice to generate code snippets in response to the user's input query.
@@ -89,42 +41,57 @@ flowchart LR
Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-**Append the value of the public IP address to the no_proxy list**
+1. Set the host_ip and huggingface token
+> Note:
+> Please replace the `your_ip_address` with your external IP address, do not use `localhost`.
+
+```bash
+export host_ip=${your_ip_address}
+export HUGGINGFACEHUB_API_TOKEN=your_huggingface_token
```
-export your_no_proxy=${your_no_proxy},"External_Public_IP"
-```
+
+2. Set Network Proxy
+
+**If you access the public network through a proxy, set the network proxy; otherwise, skip this step**
```bash
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
-export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8028"
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7778/v1/codegen"
+export https_proxy=${your_https_proxy}
```
-Note: Please replace the `host_ip` with you external IP address, do not use `localhost`.
-
### Start the Docker Containers for All Services
+CodeGen supports both a TGI service and a vLLM service; you can choose to start either one of them.
+
+Start CodeGen based on TGI service:
+
```bash
-cd GenAIExamples/CodeGen/docker_compose/intel/cpu/xeon
-docker compose up -d
+cd GenAIExamples/CodeGen/docker_compose
+source set_env.sh
+cd intel/cpu/xeon
+docker compose --profile codegen-xeon-tgi up -d
+```
+
+Start CodeGen based on vLLM service:
+
+```bash
+cd GenAIExamples/CodeGen/docker_compose
+source set_env.sh
+cd intel/cpu/xeon
+docker compose --profile codegen-xeon-vllm up -d
```
### Validate the MicroServices and MegaService
-1. TGI Service
+1. LLM Service (for TGI, vLLM)
```bash
- curl http://${host_ip}:8028/generate \
- -X POST \
- -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' \
- -H 'Content-Type: application/json'
+ curl http://${host_ip}:8028/v1/chat/completions \
+ -X POST \
+ -d '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}], "max_tokens":32}' \
+ -H 'Content-Type: application/json'
```
2. LLM Microservices
@@ -257,3 +224,52 @@ For example:
- Ask question and get answer

+
+## 🚀 Download or Build Docker Images
+
+Should the Docker image you seek not yet be available on Docker Hub, you can build the Docker image locally.
+
+### 1. Build the LLM Docker Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
+```
+
+### 2. Build the MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `codegen.py` Python script. Build MegaService Docker image via the command below:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/CodeGen
+docker build -t opea/codegen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build the UI Docker Image
+
+Build the frontend Docker image via the command below:
+
+```bash
+cd GenAIExamples/CodeGen/ui
+docker build -t opea/codegen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
+```
+
+### 4. Build CodeGen React UI Docker Image (Optional)
+
+Build react frontend Docker image via below command:
+
+**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
+
+```bash
+cd GenAIExamples/CodeGen/ui
+docker build --no-cache -t opea/codegen-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker Images:
+
+- `opea/llm-textgen:latest`
+- `opea/codegen:latest`
+- `opea/codegen-ui:latest`
+- `opea/codegen-react-ui:latest` (optional)
diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
index 7973951000..5567d9e368 100644
--- a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml
@@ -4,7 +4,9 @@
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
- container_name: tgi-service
+ container_name: tgi-server
+ profiles:
+ - codegen-xeon-tgi
ports:
- "8028:80"
volumes:
@@ -22,28 +24,66 @@ services:
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
- llm:
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ container_name: vllm-server
+ profiles:
+ - codegen-xeon-vllm
+ ports:
+ - "8028:80"
+ volumes:
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ host_ip: ${host_ip}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:8028/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
+ llm-base:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: llm-textgen-server
- depends_on:
- tgi-service:
- condition: service_healthy
- ports:
- - "9000:9000"
- ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
+ llm-tgi-service:
+ extends: llm-base
+ container_name: llm-codegen-tgi-server
+ profiles:
+ - codegen-xeon-tgi
+ ports:
+ - "9000:9000"
+ ipc: host
+ depends_on:
+ tgi-service:
+ condition: service_healthy
+ llm-vllm-service:
+ extends: llm-base
+ container_name: llm-codegen-vllm-server
+ profiles:
+ - codegen-xeon-vllm
+ ports:
+ - "9000:9000"
+ ipc: host
+ depends_on:
+ vllm-service:
+ condition: service_healthy
codegen-xeon-backend-server:
image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
container_name: codegen-xeon-backend-server
depends_on:
- - llm
+ - llm-base
ports:
- "7778:7778"
environment:
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md b/CodeGen/docker_compose/intel/hpu/gaudi/README.md
index 106f7d1ffc..133b32f09f 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/README.md
@@ -2,54 +2,7 @@
This document outlines the deployment process for a CodeGen application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi2 server. The steps include Docker images creation, container deployment via Docker Compose, and service execution to integrate microservices such as `llm`. We will publish the Docker images to the Docker Hub soon, further simplifying the deployment process for this service.
-## 🚀 Build Docker Images
-
-First of all, you need to build the Docker images locally. This step can be ignored after the Docker images published to the Docker Hub.
-
-### 1. Build the LLM Docker Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
-```
-
-### 2. Build the MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `codegen.py` Python script. Build the MegaService Docker image via the command below:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/CodeGen
-docker build -t opea/codegen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 3. Build the UI Docker Image
-
-Construct the frontend Docker image via the command below:
-
-```bash
-cd GenAIExamples/CodeGen/ui
-docker build -t opea/codegen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
-```
-
-### 4. Build CodeGen React UI Docker Image (Optional)
-
-Build react frontend Docker image via below command:
-
-**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
-
-```bash
-cd GenAIExamples/CodeGen/ui
-docker build --no-cache -t opea/codegen-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker images:
-
-- `opea/llm-textgen:latest`
-- `opea/codegen:latest`
-- `opea/codegen-ui:latest`
-- `opea/codegen-react-ui:latest`
+The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice.
## 🚀 Start MicroServices and MegaService
@@ -81,37 +34,57 @@ flowchart LR
Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+1. set the host_ip and huggingface token
+
+> [!NOTE]
+> Please replace the `your_ip_address` with your external IP address, do not use `localhost`.
+
+```bash
+export host_ip=${your_ip_address}
+export HUGGINGFACEHUB_API_TOKEN=your_huggingface_token
+```
+
+2. Set Network Proxy
+
+**If you access the public network through a proxy, set the network proxy; otherwise, skip this step**
+
```bash
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
-export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8028"
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7778/v1/codegen"
+export https_proxy=${your_https_proxy}
```
-> [!NOTE]
-> Please replace the `host_ip` with you external IP address, do not use `localhost`.
-
### Start the Docker Containers for All Services
+CodeGen supports both a TGI service and a vLLM service; you can choose to start either one of them.
+
+Start CodeGen based on TGI service:
+
+```bash
+cd GenAIExamples/CodeGen/docker_compose
+source set_env.sh
+cd intel/hpu/gaudi
+docker compose --profile codegen-gaudi-tgi up -d
+```
+
+Start CodeGen based on vLLM service:
+
```bash
-cd GenAIExamples/CodeGen/docker_compose/intel/hpu/gaudi
-docker compose up -d
+cd GenAIExamples/CodeGen/docker_compose
+source set_env.sh
+cd intel/hpu/gaudi
+docker compose --profile codegen-gaudi-vllm up -d
```
### Validate the MicroServices and MegaService
-1. TGI Service
+1. LLM Service (for TGI, vLLM)
```bash
- curl http://${host_ip}:8028/generate \
- -X POST \
- -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' \
- -H 'Content-Type: application/json'
+ curl http://${host_ip}:8028/v1/chat/completions \
+ -X POST \
+ -d '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}], "max_tokens":32}' \
+ -H 'Content-Type: application/json'
```
2. LLM Microservices
@@ -240,3 +213,52 @@ For example:
- Ask question and get answer

+
+## 🚀 Build Docker Images
+
+First of all, you need to build the Docker images locally. This step can be ignored after the Docker images published to the Docker Hub.
+
+### 1. Build the LLM Docker Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
+```
+
+### 2. Build the MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `codegen.py` Python script. Build the MegaService Docker image via the command below:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/CodeGen
+docker build -t opea/codegen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build the UI Docker Image
+
+Construct the frontend Docker image via the command below:
+
+```bash
+cd GenAIExamples/CodeGen/ui
+docker build -t opea/codegen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
+```
+
+### 4. Build CodeGen React UI Docker Image (Optional)
+
+Build react frontend Docker image via below command:
+
+**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
+
+```bash
+cd GenAIExamples/CodeGen/ui
+docker build --no-cache -t opea/codegen-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run the command `docker images`, you will have the following Docker images:
+
+- `opea/llm-textgen:latest`
+- `opea/codegen:latest`
+- `opea/codegen-ui:latest`
+- `opea/codegen-react-ui:latest`
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
index 19a77bef54..2f669e9465 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -5,6 +5,8 @@ services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
+ profiles:
+ - codegen-gaudi-tgi
ports:
- "8028:80"
volumes:
@@ -30,28 +32,74 @@ services:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
- llm:
- image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
- container_name: llm-textgen-gaudi-server
- depends_on:
- tgi-service:
- condition: service_healthy
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+ container_name: vllm-gaudi-server
+ profiles:
+ - codegen-gaudi-vllm
ports:
- - "9000:9000"
+ - "8028:80"
+ volumes:
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
+ NUM_CARDS: ${NUM_CARDS:-1}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:8028/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ runtime: habana
+ cap_add:
+ - SYS_NICE
ipc: host
+ command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256
+ llm-base:
+ image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+ container_name: llm-textgen-gaudi-server
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+ LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
+ llm-tgi-service:
+ extends: llm-base
+ container_name: llm-codegen-tgi-gaudi-server
+ profiles:
+ - codegen-gaudi-tgi
+ ports:
+ - "9000:9000"
+ ipc: host
+ depends_on:
+ tgi-service:
+ condition: service_healthy
+ llm-vllm-service:
+ extends: llm-base
+ container_name: llm-codegen-gaudi-vllm-server
+ profiles:
+ - codegen-gaudi-vllm
+ ports:
+ - "9000:9000"
+ ipc: host
+ depends_on:
+ vllm-service:
+ condition: service_healthy
codegen-gaudi-backend-server:
image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
container_name: codegen-gaudi-backend-server
depends_on:
- - llm
+ - llm-base
ports:
- "7778:7778"
environment:
diff --git a/CodeGen/docker_compose/set_env.sh b/CodeGen/docker_compose/set_env.sh
index 3144ef9589..cb9e742847 100644
--- a/CodeGen/docker_compose/set_env.sh
+++ b/CodeGen/docker_compose/set_env.sh
@@ -6,9 +6,21 @@ pushd "../../" > /dev/null
source .set_env.sh
popd > /dev/null
+export host_ip=$(hostname -I | awk '{print $1}')
+
+if [ -z "${HUGGINGFACEHUB_API_TOKEN}" ]; then
+ echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set HUGGINGFACEHUB_API_TOKEN"
+fi
+
+if [ -z "${host_ip}" ]; then
+ echo "Error: host_ip is not set. Please set host_ip first."
+fi
+
+export no_proxy=${no_proxy},${host_ip}
export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8028"
+export LLM_ENDPOINT="http://${host_ip}:8028"
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7778/v1/codegen"
+export MODEL_CACHE="./data"
diff --git a/CodeGen/docker_image_build/build.yaml b/CodeGen/docker_image_build/build.yaml
index aaee45977a..529984e35c 100644
--- a/CodeGen/docker_image_build/build.yaml
+++ b/CodeGen/docker_image_build/build.yaml
@@ -29,3 +29,15 @@ services:
dockerfile: comps/llms/src/text-generation/Dockerfile
extends: codegen
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+ vllm:
+ build:
+ context: vllm
+ dockerfile: Dockerfile.cpu
+ extends: codegen
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ vllm-gaudi:
+ build:
+ context: vllm-fork
+ dockerfile: Dockerfile.hpu
+ extends: codegen
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
diff --git a/CodeGen/tests/test_compose_on_gaudi.sh b/CodeGen/tests/test_compose_on_gaudi.sh
index e6e6d1f033..c7b6b83f7e 100644
--- a/CodeGen/tests/test_compose_on_gaudi.sh
+++ b/CodeGen/tests/test_compose_on_gaudi.sh
@@ -30,34 +30,44 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ # Download Gaudi vllm of latest tag
+ git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="codegen codegen-ui llm-textgen"
+ service_list="codegen codegen-ui llm-textgen vllm-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
+ local compose_profile="$1"
+ local llm_container_name="$2"
+
cd $WORKPATH/docker_compose/intel/hpu/gaudi
- export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
- export TGI_LLM_ENDPOINT="http://${ip_address}:8028"
+ export http_proxy=${http_proxy}
+ export https_proxy=${https_proxy}
+ export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
+ export LLM_ENDPOINT="http://${ip_address}:8028"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7778/v1/codegen"
+ export NUM_CARDS=1
export host_ip=${ip_address}
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
- docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
+ docker compose --profile ${compose_profile} up -d | tee ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
- docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log
- if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+ docker logs ${llm_container_name} > ${LOG_PATH}/llm_service_start.log 2>&1
+ if grep -E "Connected|complete" ${LOG_PATH}/llm_service_start.log; then
break
fi
sleep 5s
@@ -94,13 +104,15 @@ function validate_services() {
}
function validate_microservices() {
+ local llm_container_name="$1"
+
# tgi for llm service
validate_services \
- "${ip_address}:8028/generate" \
- "generated_text" \
- "tgi-llm" \
- "tgi-gaudi-server" \
- '{"inputs":"def print_hello_world():","parameters":{"max_new_tokens":256, "do_sample": true}}'
+ "${ip_address}:8028/v1/chat/completions" \
+ "completion_tokens" \
+ "llm-service" \
+ "${llm_container_name}" \
+ '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "def print_hello_world():"}], "max_tokens": 256}'
# llm microservice
validate_services \
@@ -152,24 +164,50 @@ function validate_frontend() {
}
function stop_docker() {
+ local docker_profile="$1"
+
cd $WORKPATH/docker_compose/intel/hpu/gaudi
- docker compose stop && docker compose rm -f
+ docker compose --profile ${docker_profile} down
}
function main() {
+ # all docker compose profiles for Gaudi Platform
+ docker_compose_profiles=("codegen-gaudi-vllm" "codegen-gaudi-tgi")
+ docker_llm_container_names=("vllm-gaudi-server" "tgi-gaudi-server")
- stop_docker
+ # get number of profiles and containers
+ len_profiles=${#docker_compose_profiles[@]}
+ len_containers=${#docker_llm_container_names[@]}
+ # number of profiles and docker container names must match
+ if [ ${len_profiles} -ne ${len_containers} ]; then
+ echo "Error: number of profiles ${len_profiles} and container names ${len_containers} mismatched"
+ exit 1
+ fi
+
+ # stop_docker, stop all profiles
+ for ((i = 0; i < len_profiles; i++)); do
+ stop_docker "${docker_compose_profiles[${i}]}"
+ done
+
+ # build docker images
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
- start_services
- validate_microservices
- validate_megaservice
- validate_frontend
+ # loop all profiles
+ for ((i = 0; i < len_profiles; i++)); do
+ echo "Process [${i}]: ${docker_compose_profiles[$i]}, ${docker_llm_container_names[${i}]}"
+ start_services "${docker_compose_profiles[${i}]}" "${docker_llm_container_names[${i}]}"
+ docker ps -a
- stop_docker
- echo y | docker system prune
+ validate_microservices "${docker_llm_container_names[${i}]}"
+ validate_megaservice
+ validate_frontend
+
+ stop_docker "${docker_compose_profiles[${i}]}"
+ sleep 5s
+ done
+ echo y | docker system prune
}
main
diff --git a/CodeGen/tests/test_compose_on_xeon.sh b/CodeGen/tests/test_compose_on_xeon.sh
index 70e5ba9c4f..6fc25963ac 100644
--- a/CodeGen/tests/test_compose_on_xeon.sh
+++ b/CodeGen/tests/test_compose_on_xeon.sh
@@ -31,8 +31,14 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ git clone https://github.com/vllm-project/vllm.git && cd vllm
+ VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null
+ cd ../
+
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="codegen codegen-ui llm-textgen"
+ service_list="codegen codegen-ui llm-textgen vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
@@ -40,10 +46,13 @@ function build_docker_images() {
}
function start_services() {
+ local compose_profile="$1"
+ local llm_container_name="$2"
+
cd $WORKPATH/docker_compose/intel/cpu/xeon/
- export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
- export TGI_LLM_ENDPOINT="http://${ip_address}:8028"
+ export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
+ export LLM_ENDPOINT="http://${ip_address}:8028"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -53,12 +62,12 @@ function start_services() {
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
- docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
+ docker compose --profile ${compose_profile} up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
- docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log
- if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+ docker logs ${llm_container_name} > ${LOG_PATH}/llm_service_start.log 2>&1
+ if grep -E "Connected|complete" ${LOG_PATH}/llm_service_start.log; then
break
fi
sleep 5s
@@ -95,13 +104,15 @@ function validate_services() {
}
function validate_microservices() {
+ local llm_container_name="$1"
+
# tgi for llm service
validate_services \
- "${ip_address}:8028/generate" \
- "generated_text" \
- "tgi-llm" \
- "tgi-service" \
- '{"inputs":"def print_hello_world():","parameters":{"max_new_tokens":256, "do_sample": true}}'
+ "${ip_address}:8028/v1/chat/completions" \
+ "completion_tokens" \
+ "llm-service" \
+ "${llm_container_name}" \
+ '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'
# llm microservice
validate_services \
@@ -109,7 +120,7 @@ function validate_microservices() {
"data: " \
"llm" \
"llm-textgen-server" \
- '{"query":"def print_hello_world():"}'
+ '{"query":"def print_hello_world():", "max_tokens": 256}'
}
@@ -120,7 +131,7 @@ function validate_megaservice() {
"print" \
"mega-codegen" \
"codegen-xeon-backend-server" \
- '{"messages": "def print_hello_world():"}'
+ '{"messages": "def print_hello_world():", "max_tokens": 256}'
}
@@ -154,24 +165,50 @@ function validate_frontend() {
function stop_docker() {
+ local docker_profile="$1"
+
cd $WORKPATH/docker_compose/intel/cpu/xeon/
- docker compose stop && docker compose rm -f
+ docker compose --profile ${docker_profile} down
}
function main() {
+ # all docker compose profiles for Xeon Platform
+ docker_compose_profiles=("codegen-xeon-tgi" "codegen-xeon-vllm")
+ docker_llm_container_names=("tgi-server" "vllm-server")
+
+ # get number of profiles and LLM docker container names
+ len_profiles=${#docker_compose_profiles[@]}
+ len_containers=${#docker_llm_container_names[@]}
+
+ # number of profiles and docker container names must match
+ if [ ${len_profiles} -ne ${len_containers} ]; then
+ echo "Error: number of profiles ${len_profiles} and container names ${len_containers} mismatched"
+ exit 1
+ fi
- stop_docker
+ # stop_docker, stop all profiles
+ for ((i = 0; i < len_profiles; i++)); do
+ stop_docker "${docker_compose_profiles[${i}]}"
+ done
+ # build docker images
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
- start_services
- validate_microservices
- validate_megaservice
- validate_frontend
+ # loop all profiles
+ for ((i = 0; i < len_profiles; i++)); do
+ echo "Process [${i}]: ${docker_compose_profiles[$i]}, ${docker_llm_container_names[${i}]}"
+ docker ps -a
+ start_services "${docker_compose_profiles[${i}]}" "${docker_llm_container_names[${i}]}"
- stop_docker
- echo y | docker system prune
+ validate_microservices "${docker_llm_container_names[${i}]}"
+ validate_megaservice
+ validate_frontend
+ stop_docker "${docker_compose_profiles[${i}]}"
+ sleep 5s
+ done
+
+ echo y | docker system prune
}
main
From ceffcffa083e2e99ceb6b36aed91c666b52c9d6f Mon Sep 17 00:00:00 2001
From: Li Gang
Date: Thu, 13 Mar 2025 10:52:33 +0800
Subject: [PATCH 060/226] [ChatQnA][docker]Check healthy of redis to avoid
dataprep failure (#1591)
Signed-off-by: Li Gang
Signed-off-by: Chingis Yundunov
---
ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml | 11 +++++++++--
ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 11 +++++++++--
2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 2427e3e1c3..47e3b73494 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -8,12 +8,19 @@ services:
ports:
- "6379:6379"
- "8001:8001"
+ healthcheck:
+ test: ["CMD", "redis-cli", "ping"]
+ interval: 5s
+ timeout: 3s
+ retries: 10
dataprep-redis-service:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-redis-server
depends_on:
- - redis-vector-db
- - tei-embedding-service
+ redis-vector-db:
+ condition: service_healthy
+ tei-embedding-service:
+ condition: service_started
ports:
- "6007:5000"
environment:
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 8ff06ecc35..6e501cb7bf 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -8,12 +8,19 @@ services:
ports:
- "6379:6379"
- "8001:8001"
+ healthcheck:
+ test: ["CMD", "redis-cli", "ping"]
+ interval: 5s
+ timeout: 3s
+ retries: 10
dataprep-redis-service:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-redis-server
depends_on:
- - redis-vector-db
- - tei-embedding-service
+ redis-vector-db:
+ condition: service_healthy
+ tei-embedding-service:
+ condition: service_started
ports:
- "6007:5000"
environment:
From 7a4e2a7d1c9852902f0d1a78649b5f83979d4969 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Thu, 13 Mar 2025 11:23:03 +0800
Subject: [PATCH 061/226] Enable GraphRAG and ProductivitySuite model cache for
docker compose test. (#1608)
Signed-off-by: ZePan110
Signed-off-by: Chingis Yundunov
---
GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml | 4 ++--
GraphRAG/tests/test_compose_on_gaudi.sh | 1 +
.../docker_compose/intel/cpu/xeon/compose.yaml | 8 ++++----
ProductivitySuite/tests/test_compose_on_xeon.sh | 1 +
4 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
index 76f1ab9f63..0b1b9d78ad 100644
--- a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -39,7 +39,7 @@ services:
ports:
- "${TEI_EMBEDDER_PORT:-12000}:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -58,7 +58,7 @@ services:
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- - "${DATA_PATH:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/GraphRAG/tests/test_compose_on_gaudi.sh b/GraphRAG/tests/test_compose_on_gaudi.sh
index bec978ad51..4d9a4128d4 100755
--- a/GraphRAG/tests/test_compose_on_gaudi.sh
+++ b/GraphRAG/tests/test_compose_on_gaudi.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
index 149109e4b7..807ef90a7f 100644
--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
@@ -39,7 +39,7 @@ services:
ports:
- "6006:80"
volumes:
- - "./data_embedding:/data"
+ - "${MODEL_CACHE:-./data_embedding}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -95,7 +95,7 @@ services:
ports:
- "8808:80"
volumes:
- - "./data_tei:/data"
+ - "${MODEL_CACHE:-./data_tei}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -136,7 +136,7 @@ services:
ports:
- "9009:80"
volumes:
- - "./data:/data"
+ - "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -209,7 +209,7 @@ services:
ports:
- "8028:80"
volumes:
- - "./data_codegen:/data"
+ - "${MODEL_CACHE:-./data_codegen}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
diff --git a/ProductivitySuite/tests/test_compose_on_xeon.sh b/ProductivitySuite/tests/test_compose_on_xeon.sh
index 333253feb2..b2717d8efc 100755
--- a/ProductivitySuite/tests/test_compose_on_xeon.sh
+++ b/ProductivitySuite/tests/test_compose_on_xeon.sh
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
From 0e6eacbc2a3b0ee8022001be0413cd764b931aa9 Mon Sep 17 00:00:00 2001
From: ZePan110
Date: Thu, 13 Mar 2025 13:38:53 +0800
Subject: [PATCH 062/226] Enable Gaudi3, Rocm and Arc on manually release test.
(#1615)
1. Enable Gaudi3, Rocm and Arc on manually release test.
2. Fix the issue that manual workflow can't be canceled.
Signed-off-by: ZePan110
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
.github/workflows/_example-workflow.yml | 41 ++++++++++++++++++-
.github/workflows/_run-docker-compose.yml | 9 +++-
.github/workflows/manual-example-workflow.yml | 3 +-
3 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml
index f3b717a284..d56099a476 100644
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -50,10 +50,26 @@ on:
type: boolean
jobs:
+ pre-build-image-check:
+ runs-on: ubuntu-latest
+ outputs:
+ should_skip: ${{ steps.check-skip.outputs.should_skip }}
+ steps:
+ - name: Check if job should be skipped
+ id: check-skip
+ run: |
+ if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
+ echo "should_skip=true" >> $GITHUB_OUTPUT
+ else
+ echo "should_skip=false" >> $GITHUB_OUTPUT
+ fi
+
####################################################################################################
# Image Build
####################################################################################################
build-images:
+ needs: [pre-build-image-check]
+ if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' }}
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
@@ -105,12 +121,33 @@ jobs:
inject_commit: ${{ inputs.inject_commit }}
tag: ${{ inputs.tag }}
+ pre-compose-test-check:
+ needs: [pre-build-image-check, build-images]
+ if: always()
+ runs-on: ubuntu-latest
+ outputs:
+ run_compose: ${{ steps.check-compose.outputs.run_compose }}
+ steps:
+ - name: Check if job should be skipped
+ id: check-compose
+ run: |
+ set -x
+ run_compose="false"
+ if [[ ${{ inputs.test_compose }} ]]; then
+ if [[ "${{ needs.pre-build-image-check.outputs.should_skip }}" == "false" && "${{ needs.build-images.result}}" == "success" || "${{ needs.pre-build-image-check.outputs.should_skip }}" == "true" ]]; then
+ run_compose="true"
+ fi
+ fi
+ echo "run_compose=$run_compose"
+ echo "run_compose=$run_compose" >> $GITHUB_OUTPUT
+
+
####################################################################################################
# Docker Compose Test
####################################################################################################
test-example-compose:
- needs: [build-images]
- if: ${{ fromJSON(inputs.test_compose) }}
+ needs: [pre-compose-test-check]
+ if: ${{ always() && needs.pre-compose-test-check.outputs.run_compose == 'true' }}
uses: ./.github/workflows/_run-docker-compose.yml
with:
tag: ${{ inputs.tag }}
diff --git a/.github/workflows/_run-docker-compose.yml b/.github/workflows/_run-docker-compose.yml
index f21c3202f9..a84912ed36 100644
--- a/.github/workflows/_run-docker-compose.yml
+++ b/.github/workflows/_run-docker-compose.yml
@@ -64,9 +64,14 @@ jobs:
cd ${{ github.workspace }}/${{ inputs.example }}/tests
run_test_cases=""
- default_test_case=$(find . -type f -name "test_compose_on_${{ inputs.hardware }}.sh" | cut -d/ -f2)
+ if [ "${{ inputs.hardware }}" == "gaudi2" ] || [ "${{ inputs.hardware }}" == "gaudi3" ]; then
+ hardware="gaudi"
+ else
+ hardware="${{ inputs.hardware }}"
+ fi
+ default_test_case=$(find . -type f -name "test_compose_on_$hardware.sh" | cut -d/ -f2)
if [ "$default_test_case" ]; then run_test_cases="$default_test_case"; fi
- other_test_cases=$(find . -type f -name "test_compose_*_on_${{ inputs.hardware }}.sh" | cut -d/ -f2)
+ other_test_cases=$(find . -type f -name "test_compose_*_on_$hardware.sh" | cut -d/ -f2)
echo "default_test_case=$default_test_case"
echo "other_test_cases=$other_test_cases"
diff --git a/.github/workflows/manual-example-workflow.yml b/.github/workflows/manual-example-workflow.yml
index 3a98b3d40e..9616f87032 100644
--- a/.github/workflows/manual-example-workflow.yml
+++ b/.github/workflows/manual-example-workflow.yml
@@ -7,7 +7,7 @@ on:
inputs:
nodes:
default: "gaudi,xeon"
- description: "Hardware to run test"
+ description: "Hardware to run test gaudi,gaudi3,xeon,rocm,arc"
required: true
type: string
examples:
@@ -96,7 +96,6 @@ jobs:
run-examples:
needs: [get-test-matrix] #[get-test-matrix, build-deploy-gmc]
- if: always()
strategy:
matrix:
example: ${{ fromJson(needs.get-test-matrix.outputs.examples) }}
From a658f80883c9986aada38789811599e91efbaa78 Mon Sep 17 00:00:00 2001
From: CharleneHu-42 <37971369+CharleneHu-42@users.noreply.github.com>
Date: Thu, 13 Mar 2025 13:50:28 +0800
Subject: [PATCH 063/226] Refine README with highlighted examples and updated
support info (#1006)
Signed-off-by: CharleneHu-42
Co-authored-by: Yi Yao
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ying Hu
Signed-off-by: Chingis Yundunov
---
README.md | 16 ++++++++++++++
supported_examples.md | 51 ++++++++++++++++++++++++++++++-------------
2 files changed, 52 insertions(+), 15 deletions(-)
diff --git a/README.md b/README.md
index 369e504200..283ffb12e2 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,22 @@ GenAIExamples are designed to give developers an easy entry into generative AI,
[GenAIEval](https://github.com/opea-project/GenAIEval) measures service performance metrics such as throughput, latency, and accuracy for GenAIExamples. This feature helps users compare performance across various hardware configurations easily.
+## Use Cases
+
+Below are some highlighted GenAI use cases across various application scenarios:
+
+| Scenario | Use Case |
+| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| Question Answering | [ChatQnA](ChatQnA) ✨: Chatbot with Retrieval Augmented Generation (RAG).
[VisualQnA](VisualQnA) ✨: Visual Question-answering. |
+| Image Generation | [Text2Image](Text2Image) ✨: Text-to-image generation. |
+| Content Summarization | [DocSum](DocSum): Document Summarization Application. |
+| FAQ Generation | [FaqGen](FaqGen): Frequently asked questions (FAQs) generation from your documents, legal texts, customer queries etc. |
+| Code Generation | [CodeGen](CodeGen): Gen-AI Powered Code Generator. |
+| Information Retrieval | [DocIndexRetriever](DocIndexRetriever): Document Retrieval with Retrieval Augmented Generation (RAG). |
+| Fine-tuning | [InstructionTuning](InstructionTuning): Application of Instruction Tuning. |
+
+For the full list of the available use cases and their supported deployment type, please refer [here](#deploy-examples).
+
## Documentation
The GenAIExamples [documentation](https://opea-project.github.io/latest/examples/index.html) contains a comprehensive guide on all available examples including architecture, deployment guides, and more. Information on GenAIComps, GenAIInfra, and GenAIEval can also be found there.
diff --git a/supported_examples.md b/supported_examples.md
index 0754be3eee..a0562a7145 100644
--- a/supported_examples.md
+++ b/supported_examples.md
@@ -105,9 +105,9 @@ This document introduces the supported examples of GenAIExamples. The supported
[VisualQnA](./VisualQnA/README.md) is an example of chatbot for question and answering based on the images.
-| LVM | HW | Description |
-| --------------------------------------------------------------------------------------------- | ------ | ----------- |
-| [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | Gaudi2 | Chatbot |
+| LVM | HW | Description |
+| --------------------------------------------------------------------------------------------- | ----------- | ----------- |
+| [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | Xeon/Gaudi2 | Chatbot |
### VideoQnA
@@ -122,27 +122,27 @@ By default, the embedding and LVM models are set to a default value as listed be
### RerankFinetuning
-Rerank model finetuning example is for training rerank model on a dataset for improving its capability on specific field.
+[Rerank model finetuning](./RerankFinetuning/README.md) example is for training rerank model on a dataset for improving its capability on specific field.
By default, the base model is set to a default value as listed below:
-| Service | Base Model | HW | Description |
-| ----------------- | ------------------------------------------------------------------------- | ---- | ------------------------------- |
-| Rerank Finetuning | [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | Xeon | Rerank model finetuning service |
+| Service | Base Model | HW | Description |
+| ----------------- | ------------------------------------------------------------------------- | ----------- | ------------------------------- |
+| Rerank Finetuning | [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | Xeon/Gaudi2 | Rerank model finetuning service |
### InstructionTuning
-The Instruction Tuning example is designed to further train large language models (LLMs) on a dataset consisting of (instruction, output) pairs using supervised learning. This process bridges the gap between the LLM's original objective of next-word prediction and the user’s objective of having the model follow human instructions accurately. By leveraging Instruction Tuning, this example enhances the LLM's ability to better understand and execute specific tasks, improving the model's alignment with user instructions and its overall performance.
+The [Instruction Tuning](./InstructionTuning/README.md) example is designed to further train large language models (LLMs) on a dataset consisting of (instruction, output) pairs using supervised learning. This process bridges the gap between the LLM's original objective of next-word prediction and the user’s objective of having the model follow human instructions accurately. By leveraging Instruction Tuning, this example enhances the LLM's ability to better understand and execute specific tasks, improving the model's alignment with user instructions and its overall performance.
By default, the base model is set to a default value as listed below:
-| Service | Base Model | HW | Description |
-| ----------------- | ------------------------------------------------------------------------------------- | ---------- | ------------------------------------ |
-| InstructionTuning | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | Xeon/Gaudi | LLM model Instruction Tuning service |
+| Service | Base Model | HW | Description |
+| ----------------- | ------------------------------------------------------------------------------------- | ----------- | ------------------------------------ |
+| InstructionTuning | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | Xeon/Gaudi2 | LLM model Instruction Tuning service |
### DocIndexRetriever
-The DocRetriever example demonstrates how to match user queries with free-text records using various retrieval methods. It plays a key role in Retrieval-Augmented Generation (RAG) systems by dynamically fetching relevant information from external sources, ensuring responses are factual and up-to-date. Powered by vector databases, DocRetriever enables efficient, semantic retrieval by storing data as vectors and quickly identifying the most relevant documents based on similarity.
+The [DocRetriever](./DocIndexRetriever/README.md) example demonstrates how to match user queries with free-text records using various retrieval methods. It plays a key role in Retrieval-Augmented Generation (RAG) systems by dynamically fetching relevant information from external sources, ensuring responses are factual and up-to-date. Powered by vector databases, DocRetriever enables efficient, semantic retrieval by storing data as vectors and quickly identifying the most relevant documents based on similarity.
| Framework | Embedding | Vector Database | Serving | HW | Description |
| ------------------------------------------------------------------------------ | --------------------------------------------------- | -------------------------- | --------------------------------------------------------------- | ----------- | -------------------------- |
@@ -150,7 +150,7 @@ The DocRetriever example demonstrates how to match user queries with free-text r
### AgentQnA
-The AgentQnA example demonstrates a hierarchical, multi-agent system designed for question-answering tasks. A supervisor agent interacts directly with the user, delegating tasks to a worker agent and utilizing various tools to gather information and generate answers. The worker agent primarily uses a retrieval tool to respond to the supervisor's queries. Additionally, the supervisor can access other tools, such as APIs to query knowledge graphs, SQL databases, or external knowledge bases, to enhance the accuracy and relevance of its responses.
+The [AgentQnA](./AgentQnA/README.md) example demonstrates a hierarchical, multi-agent system designed for question-answering tasks. A supervisor agent interacts directly with the user, delegating tasks to a worker agent and utilizing various tools to gather information and generate answers. The worker agent primarily uses a retrieval tool to respond to the supervisor's queries. Additionally, the supervisor can access other tools, such as APIs to query knowledge graphs, SQL databases, or external knowledge bases, to enhance the accuracy and relevance of its responses.
Worker agent uses open-source websearch tool (duckduckgo), agents use OpenAI GPT-4o-mini as llm backend.
@@ -158,7 +158,7 @@ Worker agent uses open-source websearch tool (duckduckgo), agents use OpenAI GPT
### AudioQnA
-The AudioQnA example demonstrates the integration of Generative AI (GenAI) models for performing question-answering (QnA) on audio files, with the added functionality of Text-to-Speech (TTS) for generating spoken responses. The example showcases how to convert audio input to text using Automatic Speech Recognition (ASR), generate answers to user queries using a language model, and then convert those answers back to speech using Text-to-Speech (TTS).
+The [AudioQnA](./AudioQnA/README.md) example demonstrates the integration of Generative AI (GenAI) models for performing question-answering (QnA) on audio files, with the added functionality of Text-to-Speech (TTS) for generating spoken responses. The example showcases how to convert audio input to text using Automatic Speech Recognition (ASR), generate answers to user queries using a language model, and then convert those answers back to speech using Text-to-Speech (TTS).
@@ -179,7 +179,7 @@ The AudioQnA example demonstrates the integration of Generative AI (GenAI) model
### FaqGen
-FAQ Generation Application leverages the power of large language models (LLMs) to revolutionize the way you interact with and comprehend complex textual data. By harnessing cutting-edge natural language processing techniques, our application can automatically generate comprehensive and natural-sounding frequently asked questions (FAQs) from your documents, legal texts, customer queries, and other sources. In this example use case, we utilize LangChain to implement FAQ Generation and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
+[FAQ Generation](./FaqGen/README.md) application leverages the power of large language models (LLMs) to revolutionize the way you interact with and comprehend complex textual data. By harnessing cutting-edge natural language processing techniques, our application can automatically generate comprehensive and natural-sounding frequently asked questions (FAQs) from your documents, legal texts, customer queries, and other sources. In this example use case, we utilize LangChain to implement FAQ Generation and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
| Framework | LLM | Serving | HW | Description |
| ------------------------------------------------------------------------------ | ----------------------------------------------------------------- | --------------------------------------------------------------- | ----------- | ----------- |
| [LangChain](https://www.langchain.com)/[LlamaIndex](https://www.llamaindex.ai) | [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [TGI](https://github.com/huggingface/text-generation-inference) | Xeon/Gaudi2 | Chatbot |
@@ -199,3 +199,24 @@ FAQ Generation Application leverages the power of large language models (LLMs) t
### ProductivitySuite
[Productivity Suite](./ProductivitySuite/README.md) streamlines your workflow to boost productivity. It leverages the power of OPEA microservices to deliver a comprehensive suite of features tailored to meet the diverse needs of modern enterprises.
+
+### DBQnA
+
+[DBQnA](./DBQnA/README.md) converts your natural language query into an SQL query, automatically executes the generated query on the database and delivers real-time query results.
+| Framework | LLM | Database | HW | Description |
+|----------------------------------------|-------------------------------------------------------------------------------------------------|-------------------------------------------|------|----------------------------|
+| [LangChain](https://www.langchain.com) | [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | [PostgresDB](https://www.postgresql.org/) | Xeon | Natural language SQL agent |
+
+### Text2Image
+
+[Text2Image](./Text2Image/README.md) generates image based on your provided text.
+| Framework | LDM | HW | Description |
+|----------------------------------------|--------------------------------------------------------------------------------------------------------|-------------|-------------|
+| [LangChain](https://www.langchain.com) | [stabilityai/stable-diffusion](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) | Xeon/Gaudi2 | Text2Image |
+
+### AvatarChatbot
+
+[AvatarChatbot](./AvatarChatbot/README.md) example is a chatbot with a visual character that provides users dynamic, engaging interactions, by leveraging multiple generative AI components including LLM, ASR (Audio-Speech-Recognition), and TTS (Text-To-Speech).
+| LLM | ASR | TTS | Animation | HW | Description |
+|-------------------------------------------------------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-------------|----------------------------|
+| [Intel/neural-chat-7b-v3-3](https://huggingface.co/Intel/neural-chat-7b-v3-3) | [openai/whisper-small](https://huggingface.co/openai/whisper-small) | [microsoft/SpeechT5](https://huggingface.co/microsoft/speecht5_tts) | [Rudrabha/Wav2Lip](https://github.com/Rudrabha/Wav2Lip)
[TencentARC/GFPGAN](https://github.com/TencentARC/GFPGAN) | Xeon/Gaudi2 | Interactive chatbot Avatar |
From d12765ba908826ceeff9f08952d2b06b837c150e Mon Sep 17 00:00:00 2001
From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com>
Date: Fri, 14 Mar 2025 09:56:33 +0800
Subject: [PATCH 064/226] [AudioQnA] Enable vLLM and set it as default LLM
serving (#1657)
Signed-off-by: Wang, Kai Lawrence
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov
---
AudioQnA/audioqna.py | 2 +-
AudioQnA/audioqna_multilang.py | 2 +-
.../docker_compose/intel/cpu/xeon/README.md | 111 ++++++++++---
.../intel/cpu/xeon/compose.yaml | 25 +--
.../intel/cpu/xeon/compose_multilang.yaml | 26 ++--
.../intel/cpu/xeon/compose_tgi.yaml | 87 +++++++++++
.../docker_compose/intel/cpu/xeon/set_env.sh | 2 +-
.../docker_compose/intel/hpu/gaudi/README.md | 115 +++++++++++---
.../intel/hpu/gaudi/compose.yaml | 29 ++--
.../intel/hpu/gaudi/compose_tgi.yaml | 108 +++++++++++++
.../docker_compose/intel/hpu/gaudi/set_env.sh | 8 +-
AudioQnA/docker_image_build/build.yaml | 12 ++
AudioQnA/tests/test_compose_on_gaudi.sh | 23 ++-
AudioQnA/tests/test_compose_on_xeon.sh | 19 ++-
AudioQnA/tests/test_compose_tgi_on_gaudi.sh | 146 ++++++++++++++++++
AudioQnA/tests/test_compose_tgi_on_xeon.sh | 137 ++++++++++++++++
16 files changed, 750 insertions(+), 102 deletions(-)
create mode 100644 AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
create mode 100644 AudioQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
create mode 100644 AudioQnA/tests/test_compose_tgi_on_gaudi.sh
create mode 100644 AudioQnA/tests/test_compose_tgi_on_xeon.sh
diff --git a/AudioQnA/audioqna.py b/AudioQnA/audioqna.py
index f74e58053f..dcb59633c0 100644
--- a/AudioQnA/audioqna.py
+++ b/AudioQnA/audioqna.py
@@ -16,7 +16,7 @@
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
-LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
diff --git a/AudioQnA/audioqna_multilang.py b/AudioQnA/audioqna_multilang.py
index edc14cc93c..8f4a65e748 100644
--- a/AudioQnA/audioqna_multilang.py
+++ b/AudioQnA/audioqna_multilang.py
@@ -17,7 +17,7 @@
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
-LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/README.md b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
index 3f91c02e02..aabaf36595 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
@@ -2,6 +2,10 @@
This document outlines the deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server.
+The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [Start the MegaService](#-start-the-megaservice) section in this page.
+
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+
## 🚀 Build Docker images
### 1. Source Code install GenAIComps
@@ -17,9 +21,15 @@ cd GenAIComps
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
```
-### 3. Build LLM Image
+### 3. Build vLLM Image
-Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu (https://github.com/huggingface/text-generation-inference)
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd ./vllm/
+VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
+```
### 4. Build TTS Image
@@ -43,9 +53,10 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`, you will have following images ready:
1. `opea/whisper:latest`
-2. `opea/speecht5:latest`
-3. `opea/audioqna:latest`
-4. `opea/gpt-sovits:latest` (optional)
+2. `opea/vllm:latest`
+3. `opea/speecht5:latest`
+4. `opea/audioqna:latest`
+5. `opea/gpt-sovits:latest` (optional)
## 🚀 Set the environment variables
@@ -55,7 +66,7 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip= # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -73,40 +84,90 @@ export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
or use set_env.sh file to setup environment variables.
-Note: Please replace with host_ip with your external IP address, do not use localhost.
+Note:
+
+- Please replace host_ip with your external IP address; do not use localhost.
+- If you are in a proxy environment, also set the proxy-related environment variables:
+
+```
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,gpt-sovits-service,tgi-service,vllm-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server
+```
## 🚀 Start the MegaService
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
+```
+
+If using vLLM as the LLM serving backend:
+
+```
docker compose up -d
# multilang tts (optional)
docker compose -f compose_multilang.yaml up -d
```
+If using TGI as the LLM serving backend:
+
+```
+docker compose -f compose_tgi.yaml up -d
+```
+
## 🚀 Test MicroServices
-```bash
-# whisper service
-wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
-curl http://${host_ip}:7066/v1/audio/transcriptions \
- -H "Content-Type: multipart/form-data" \
- -F file="@./sample.wav" \
- -F model="openai/whisper-small"
-
-# tgi service
-curl http://${host_ip}:3006/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json'
+1. Whisper Service
-# speecht5 service
-curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
+ ```bash
+ wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
+ curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
+ -H "Content-Type: multipart/form-data" \
+ -F file="@./sample.wav" \
+ -F model="openai/whisper-small"
+ ```
-# gpt-sovits service (optional)
-curl http://${host_ip}:9880/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
-```
+2. LLM backend Service
+
+ In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready and the container (`vllm-service` or `tgi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
+
+ Or try the command below to check whether the LLM serving is ready.
+
+ ```bash
+ # vLLM service
+ docker logs vllm-service 2>&1 | grep complete
+ # If the service is ready, you will get the response like below.
+ INFO: Application startup complete.
+ ```
+
+ ```bash
+ # TGI service
+ docker logs tgi-service | grep Connected
+ # If the service is ready, you will get the response like below.
+ 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
+ ```
+
+ Then try the `cURL` command below to validate services.
+
+ ```bash
+ # either vLLM or TGI service
+ curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
+ -X POST \
+ -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. TTS Service
+
+ ```bash
+ # speecht5 service
+ curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
+
+ # gpt-sovits service (optional)
+ curl http://${host_ip}:${GPT_SOVITS_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
+ ```
## 🚀 Test MegaService
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 3b47780d80..1fe5e6b2a6 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- - "7066:7066"
+ - ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -17,38 +17,41 @@ services:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- - "7055:7055"
+ - ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
- tgi-service:
- image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
- container_name: tgi-service
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ container_name: vllm-service
ports:
- - "3006:80"
+ - ${LLM_SERVER_PORT:-3006}:80
volumes:
- - "${MODEL_CACHE:-./data}:/data"
- shm_size: 1g
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+ shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
+ LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
- test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
+ test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
- command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-xeon-backend-server
depends_on:
- whisper-service
- - tgi-service
+ - vllm-service
- speecht5-service
ports:
- "3008:8888"
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
index fde5a56902..3aecacf591 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- - "7066:7066"
+ - ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -18,27 +18,35 @@ services:
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
container_name: gpt-sovits-service
ports:
- - "9880:9880"
+ - ${GPT_SOVITS_SERVER_PORT:-9880}:9880
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
- tgi-service:
- image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
- container_name: tgi-service
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ container_name: vllm-service
ports:
- - "3006:80"
+ - ${LLM_SERVER_PORT:-3006}:80
volumes:
- - "${MODEL_CACHE:-./data}:/data"
- shm_size: 1g
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+ shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
+ LLM_SERVER_PORT: ${LLM_SERVER_PORT}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna-multilang:${TAG:-latest}
container_name: audioqna-xeon-backend-server
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
new file mode 100644
index 0000000000..d421f488fd
--- /dev/null
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -0,0 +1,87 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - ${WHISPER_SERVER_PORT:-7066}:7066
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+ speecht5-service:
+ image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
+ container_name: speecht5-service
+ ports:
+ - ${SPEECHT5_SERVER_PORT:-7055}:7055
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+ tgi-service:
+ image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+ container_name: tgi-service
+ ports:
+ - ${LLM_SERVER_PORT:-3006}:80
+ volumes:
+ - "${MODEL_CACHE:-./data}:/data"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LLM_SERVER_PORT: ${LLM_SERVER_PORT}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+ audioqna-xeon-backend-server:
+ image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
+ container_name: audioqna-xeon-backend-server
+ depends_on:
+ - whisper-service
+ - tgi-service
+ - speecht5-service
+ ports:
+ - "3008:8888"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
+ - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
+ - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
+ - LLM_SERVER_PORT=${LLM_SERVER_PORT}
+ - LLM_MODEL_ID=${LLM_MODEL_ID}
+ - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
+ - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
+ ipc: host
+ restart: always
+ audioqna-xeon-ui-server:
+ image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
+ container_name: audioqna-xeon-ui-server
+ depends_on:
+ - audioqna-xeon-backend-server
+ ports:
+ - "5173:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/set_env.sh b/AudioQnA/docker_compose/intel/cpu/xeon/set_env.sh
index e98f6e04ec..adc652f169 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -8,7 +8,7 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
#
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
index b60253a147..602b99ea22 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -2,6 +2,10 @@
This document outlines the deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server.
+The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [Start the MegaService](#-start-the-megaservice) section in this page.
+
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+
## 🚀 Build Docker images
### 1. Source Code install GenAIComps
@@ -17,9 +21,13 @@ cd GenAIComps
docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
```
-### 3. Build LLM Image
+### 3. Build vLLM Image
-Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi)
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork/
+VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .
### 4. Build TTS Image
@@ -40,8 +48,9 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`, you will have following images ready:
1. `opea/whisper-gaudi:latest`
-2. `opea/speecht5-gaudi:latest`
-3. `opea/audioqna:latest`
+2. `opea/vllm-gaudi:latest`
+3. `opea/speecht5-gaudi:latest`
+4. `opea/audioqna:latest`
## 🚀 Set the environment variables
@@ -51,7 +60,12 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip= # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+# set vLLM parameters
+export NUM_CARDS=1
+export BLOCK_SIZE=128
+export MAX_NUM_SEQS=256
+export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -65,37 +79,90 @@ export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
```
+or use set_env.sh file to setup environment variables.
+
+Note:
+
+- Please replace host_ip with your external IP address; do not use localhost.
+- If you are in a proxy environment, also set the proxy-related environment variables:
+
+```
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,tgi-service,vllm-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server
+```
+
## 🚀 Start the MegaService
> **_NOTE:_** Users will need at least three Gaudi cards for AudioQnA.
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/hpu/gaudi/
-docker compose up -d
```
-## 🚀 Test MicroServices
-
-```bash
-# whisper service
-curl http://${host_ip}:7066/v1/asr \
- -X POST \
- -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
- -H 'Content-Type: application/json'
+If using vLLM as the LLM serving backend:
-# tgi service
-curl http://${host_ip}:3006/generate \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json'
+```
+docker compose up -d
+```
-# speecht5 service
-curl http://${host_ip}:7055/v1/tts \
- -X POST \
- -d '{"text": "Who are you?"}' \
- -H 'Content-Type: application/json'
+If using TGI as the LLM serving backend:
```
+docker compose -f compose_tgi.yaml up -d
+```
+
+## 🚀 Test MicroServices
+
+1. Whisper Service
+
+ ```bash
+ curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr \
+ -X POST \
+ -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+2. LLM backend Service
+
+ In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready and the container (`vllm-gaudi-service` or `tgi-gaudi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
+
+ Or try the command below to check whether the LLM serving is ready.
+
+ ```bash
+ # vLLM service
+ docker logs vllm-gaudi-service 2>&1 | grep complete
+ # If the service is ready, you will get the response like below.
+ INFO: Application startup complete.
+ ```
+
+ ```bash
+ # TGI service
+ docker logs tgi-gaudi-service | grep Connected
+ # If the service is ready, you will get the response like below.
+ 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
+ ```
+
+ Then try the `cURL` command below to validate services.
+
+ ```bash
+ # either vLLM or TGI service
+ curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
+ -X POST \
+ -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+ -H 'Content-Type: application/json'
+ ```
+
+3. TTS Service
+
+ ```bash
+ # speecht5 service
+ curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/tts \
+ -X POST \
+ -d '{"text": "Who are you?"}' \
+ -H 'Content-Type: application/json'
+ ```
## 🚀 Test MegaService
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 9e43a355b5..db93cd8223 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
container_name: whisper-service
ports:
- - "7066:7066"
+ - ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -22,7 +22,7 @@ services:
image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
container_name: speecht5-service
ports:
- - "7055:7055"
+ - ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -34,28 +34,27 @@ services:
cap_add:
- SYS_NICE
restart: unless-stopped
- tgi-service:
- image: ghcr.io/huggingface/tgi-gaudi:2.3.1
- container_name: tgi-gaudi-server
+ vllm-service:
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+ container_name: vllm-gaudi-service
ports:
- - "3006:80"
+ - ${LLM_SERVER_PORT:-3006}:80
volumes:
- - "${MODEL_CACHE:-./data}:/data"
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
- ENABLE_HPU_GRAPH: true
- LIMIT_HPU_GRAPH: true
- USE_FLASH_ATTENTION: true
- FLASH_ATTENTION_RECOMPUTE: true
+ LLM_MODEL_ID: ${LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
+ LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
- test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
+ test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
@@ -63,13 +62,13 @@ services:
cap_add:
- SYS_NICE
ipc: host
- command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
+ command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
audioqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-gaudi-backend-server
depends_on:
- whisper-service
- - tgi-service
+ - vllm-service
- speecht5-service
ports:
- "3008:8888"
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
new file mode 100644
index 0000000000..f14bd8cb99
--- /dev/null
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -0,0 +1,108 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ whisper-service:
+ image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - ${WHISPER_SERVER_PORT:-7066}:7066
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ runtime: habana
+ cap_add:
+ - SYS_NICE
+ restart: unless-stopped
+ speecht5-service:
+ image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
+ container_name: speecht5-service
+ ports:
+ - ${SPEECHT5_SERVER_PORT:-7055}:7055
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ runtime: habana
+ cap_add:
+ - SYS_NICE
+ restart: unless-stopped
+ tgi-service:
+ image: ghcr.io/huggingface/tgi-gaudi:2.3.1
+ container_name: tgi-gaudi-service
+ ports:
+ - ${LLM_SERVER_PORT:-3006}:80
+ volumes:
+ - "${MODEL_CACHE:-./data}:/data"
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ ENABLE_HPU_GRAPH: true
+ LIMIT_HPU_GRAPH: true
+ USE_FLASH_ATTENTION: true
+ FLASH_ATTENTION_RECOMPUTE: true
+ LLM_SERVER_PORT: ${LLM_SERVER_PORT}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ runtime: habana
+ cap_add:
+ - SYS_NICE
+ ipc: host
+ command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
+ audioqna-gaudi-backend-server:
+ image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
+ container_name: audioqna-gaudi-backend-server
+ depends_on:
+ - whisper-service
+ - tgi-service
+ - speecht5-service
+ ports:
+ - "3008:8888"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
+ - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
+ - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
+ - LLM_SERVER_PORT=${LLM_SERVER_PORT}
+ - LLM_MODEL_ID=${LLM_MODEL_ID}
+ - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
+ - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
+ ipc: host
+ restart: always
+ audioqna-gaudi-ui-server:
+ image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
+ container_name: audioqna-gaudi-ui-server
+ depends_on:
+ - audioqna-gaudi-backend-server
+ ports:
+ - "5173:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
+ ipc: host
+ restart: always
+
+networks:
+ default:
+ driver: bridge
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/AudioQnA/docker_compose/intel/hpu/gaudi/set_env.sh
index e98f6e04ec..179a8c2a24 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -8,7 +8,13 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
#
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+
+# set vLLM parameters
+export NUM_CARDS=1
+export BLOCK_SIZE=128
+export MAX_NUM_SEQS=256
+export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
diff --git a/AudioQnA/docker_image_build/build.yaml b/AudioQnA/docker_image_build/build.yaml
index bc9f67d9c0..71bb44c810 100644
--- a/AudioQnA/docker_image_build/build.yaml
+++ b/AudioQnA/docker_image_build/build.yaml
@@ -71,3 +71,15 @@ services:
dockerfile: comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile
extends: audioqna
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
+ vllm:
+ build:
+ context: vllm
+ dockerfile: Dockerfile.cpu
+ extends: audioqna
+ image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+ vllm-gaudi:
+ build:
+ context: vllm-fork
+ dockerfile: Dockerfile.hpu
+ extends: audioqna
+ image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
diff --git a/AudioQnA/tests/test_compose_on_gaudi.sh b/AudioQnA/tests/test_compose_on_gaudi.sh
index fe5cff379a..1e356750e6 100644
--- a/AudioQnA/tests/test_compose_on_gaudi.sh
+++ b/AudioQnA/tests/test_compose_on_gaudi.sh
@@ -31,18 +31,27 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ git clone https://github.com/HabanaAI/vllm-fork.git
+ cd vllm-fork/
+ VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
+
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
+ service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi vllm-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+ export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+ export NUM_CARDS=1
+ export BLOCK_SIZE=128
+ export MAX_NUM_SEQS=256
+ export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -61,8 +70,8 @@ function start_services() {
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
- docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log
- if grep -q Connected $LOG_PATH/tgi_service_start.log; then
+ docker logs vllm-gaudi-service > $LOG_PATH/vllm_service_start.log 2>&1
+ if grep -q complete $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 5s
@@ -86,7 +95,7 @@ function validate_megaservice() {
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
- docker logs tgi-gaudi-server > $LOG_PATH/tgi-gaudi-server.log
+ docker logs vllm-gaudi-service > $LOG_PATH/vllm-gaudi-service.log
docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
@@ -126,7 +135,7 @@ function validate_megaservice() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
- docker compose stop && docker compose rm -f
+ docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {
diff --git a/AudioQnA/tests/test_compose_on_xeon.sh b/AudioQnA/tests/test_compose_on_xeon.sh
index 11a86ba5c8..b1ff1164d2 100644
--- a/AudioQnA/tests/test_compose_on_xeon.sh
+++ b/AudioQnA/tests/test_compose_on_xeon.sh
@@ -31,18 +31,23 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+ git clone https://github.com/vllm-project/vllm.git
+ cd ./vllm/
+ VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+ echo "Check out vLLM tag ${VLLM_VER}"
+ git checkout ${VLLM_VER} &> /dev/null && cd ../
+
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="audioqna audioqna-ui whisper speecht5"
+ service_list="audioqna audioqna-ui whisper speecht5 vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
- docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+ export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -62,8 +67,8 @@ function start_services() {
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
- docker logs tgi-service > $LOG_PATH/tgi_service_start.log
- if grep -q Connected $LOG_PATH/tgi_service_start.log; then
+ docker logs vllm-service > $LOG_PATH/vllm_service_start.log 2>&1
+ if grep -q complete $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 5s
@@ -77,7 +82,7 @@ function validate_megaservice() {
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
- docker logs tgi-service > $LOG_PATH/tgi-service.log
+ docker logs vllm-service > $LOG_PATH/vllm-service.log
docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
@@ -117,7 +122,7 @@ function validate_megaservice() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
- docker compose stop && docker compose rm -f
+ docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {
diff --git a/AudioQnA/tests/test_compose_tgi_on_gaudi.sh b/AudioQnA/tests/test_compose_tgi_on_gaudi.sh
new file mode 100644
index 0000000000..5a046adfdb
--- /dev/null
+++ b/AudioQnA/tests/test_compose_tgi_on_gaudi.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -e
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/hpu/gaudi
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+
+ export MEGA_SERVICE_HOST_IP=${ip_address}
+ export WHISPER_SERVER_HOST_IP=${ip_address}
+ export SPEECHT5_SERVER_HOST_IP=${ip_address}
+ export LLM_SERVER_HOST_IP=${ip_address}
+
+ export WHISPER_SERVER_PORT=7066
+ export SPEECHT5_SERVER_PORT=7055
+ export LLM_SERVER_PORT=3006
+
+ export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
+ export host_ip=${ip_address}
+ # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+ # Start Docker Containers
+ docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+ n=0
+ until [[ "$n" -ge 200 ]]; do
+ docker logs tgi-gaudi-service > $LOG_PATH/tgi_service_start.log
+ if grep -q Connected $LOG_PATH/tgi_service_start.log; then
+ break
+ fi
+ sleep 5s
+ n=$((n+1))
+ done
+
+ n=0
+ until [[ "$n" -ge 100 ]]; do
+ docker logs whisper-service > $LOG_PATH/whisper_service_start.log
+ if grep -q "Uvicorn server setup on port" $LOG_PATH/whisper_service_start.log; then
+ break
+ fi
+ sleep 5s
+ n=$((n+1))
+ done
+}
+
+
+function validate_megaservice() {
+ response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
+ # always print the log
+ docker logs whisper-service > $LOG_PATH/whisper-service.log
+ docker logs speecht5-service > $LOG_PATH/tts-service.log
+ docker logs tgi-gaudi-service > $LOG_PATH/tgi-gaudi-service.log
+ docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
+ echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
+
+ if [[ $(file speech.mp3) == *"RIFF"* ]]; then
+ echo "Result correct."
+ else
+ echo "Result wrong."
+ exit 1
+ fi
+
+}
+
+#function validate_frontend() {
+# cd $WORKPATH/ui/svelte
+# local conda_env_name="OPEA_e2e"
+# export PATH=${HOME}/miniforge3/bin/:$PATH
+## conda remove -n ${conda_env_name} --all -y
+## conda create -n ${conda_env_name} python=3.12 -y
+# source activate ${conda_env_name}
+#
+# sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+## conda install -c conda-forge nodejs=22.6.0 -y
+# npm install && npm ci && npx playwright install --with-deps
+# node -v && npm -v && pip list
+#
+# exit_status=0
+# npx playwright test || exit_status=$?
+#
+# if [ $exit_status -ne 0 ]; then
+# echo "[TEST INFO]: ---------frontend test failed---------"
+# exit $exit_status
+# else
+# echo "[TEST INFO]: ---------frontend test passed---------"
+# fi
+#}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/intel/hpu/gaudi
+ docker compose -f compose_tgi.yaml stop && docker compose rm -f
+}
+
+function main() {
+
+ stop_docker
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ start_services
+
+ validate_megaservice
+ # validate_frontend
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
diff --git a/AudioQnA/tests/test_compose_tgi_on_xeon.sh b/AudioQnA/tests/test_compose_tgi_on_xeon.sh
new file mode 100644
index 0000000000..d735c87b94
--- /dev/null
+++ b/AudioQnA/tests/test_compose_tgi_on_xeon.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -e
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+ opea_branch=${opea_branch:-"main"}
+ # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+ if [[ "${opea_branch}" != "main" ]]; then
+ cd $WORKPATH
+ OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+ NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+ find . -type f -name "Dockerfile*" | while read -r file; do
+ echo "Processing file: $file"
+ sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+ done
+ fi
+
+ cd $WORKPATH/docker_image_build
+ git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+ echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+ service_list="audioqna audioqna-ui whisper speecht5"
+ docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+ docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+ docker images && sleep 1s
+}
+
+function start_services() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+ export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+
+ export MEGA_SERVICE_HOST_IP=${ip_address}
+ export WHISPER_SERVER_HOST_IP=${ip_address}
+ export SPEECHT5_SERVER_HOST_IP=${ip_address}
+ export LLM_SERVER_HOST_IP=${ip_address}
+
+ export WHISPER_SERVER_PORT=7066
+ export SPEECHT5_SERVER_PORT=7055
+ export LLM_SERVER_PORT=3006
+
+ export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
+ export host_ip=${ip_address}
+
+ # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+ # Start Docker Containers
+ docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+ n=0
+ until [[ "$n" -ge 200 ]]; do
+ docker logs tgi-service > $LOG_PATH/tgi_service_start.log
+ if grep -q Connected $LOG_PATH/tgi_service_start.log; then
+ break
+ fi
+ sleep 5s
+ n=$((n+1))
+ done
+}
+
+
+function validate_megaservice() {
+ response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
+ # always print the log
+ docker logs whisper-service > $LOG_PATH/whisper-service.log
+ docker logs speecht5-service > $LOG_PATH/tts-service.log
+ docker logs tgi-service > $LOG_PATH/tgi-service.log
+ docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
+ echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
+
+ if [[ $(file speech.mp3) == *"RIFF"* ]]; then
+ echo "Result correct."
+ else
+ echo "Result wrong."
+ exit 1
+ fi
+
+}
+
+#function validate_frontend() {
+# cd $WORKPATH/ui/svelte
+# local conda_env_name="OPEA_e2e"
+# export PATH=${HOME}/miniforge3/bin/:$PATH
+## conda remove -n ${conda_env_name} --all -y
+## conda create -n ${conda_env_name} python=3.12 -y
+# source activate ${conda_env_name}
+#
+# sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+## conda install -c conda-forge nodejs=22.6.0 -y
+# npm install && npm ci && npx playwright install --with-deps
+# node -v && npm -v && pip list
+#
+# exit_status=0
+# npx playwright test || exit_status=$?
+#
+# if [ $exit_status -ne 0 ]; then
+# echo "[TEST INFO]: ---------frontend test failed---------"
+# exit $exit_status
+# else
+# echo "[TEST INFO]: ---------frontend test passed---------"
+# fi
+#}
+
+function stop_docker() {
+ cd $WORKPATH/docker_compose/intel/cpu/xeon/
+ docker compose -f compose_tgi.yaml stop && docker compose rm -f
+}
+
+function main() {
+
+ stop_docker
+ if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ start_services
+
+ validate_megaservice
+ # validate_frontend
+
+ stop_docker
+ echo y | docker system prune
+
+}
+
+main
From a355478ef05d465f2c560fab6db7305d3393aba6 Mon Sep 17 00:00:00 2001
From: Louie Tsai
Date: Thu, 13 Mar 2025 23:18:29 -0700
Subject: [PATCH 065/226] [ChatQnA] Enable Prometheus and Grafana with
telemetry docker compose file. (#1623)
Signed-off-by: Tsai, Louie
Signed-off-by: Chingis Yundunov
---
ChatQnA/README.md | 4 +-
.../docker_compose/intel/cpu/xeon/README.md | 4 +-
.../intel/cpu/xeon/compose.telemetry.yaml | 61 ++++++++++++++++--
.../intel/cpu/xeon/compose_tgi.telemetry.yaml | 61 ++++++++++++++++--
.../dashboards/download_opea_dashboard.sh | 6 ++
.../provisioning/dashboards/local.yaml | 14 +++++
.../provisioning/datasources/datasource.yml | 54 ++++++++++++++++
.../intel/cpu/xeon/prometheus.yaml | 43 +++++++++++++
.../docker_compose/intel/cpu/xeon/set_env.sh | 2 +
.../docker_compose/intel/hpu/gaudi/README.md | 3 +
.../intel/hpu/gaudi/compose.telemetry.yaml | 63 ++++++++++++++++++-
.../hpu/gaudi/compose_tgi.telemetry.yaml | 63 ++++++++++++++++++-
.../dashboards/download_opea_dashboard.sh | 7 +++
.../provisioning/dashboards/local.yaml | 14 +++++
.../provisioning/datasources/datasource.yml | 54 ++++++++++++++++
.../intel/hpu/gaudi/prometheus.yaml | 47 ++++++++++++++
.../docker_compose/intel/hpu/gaudi/set_env.sh | 1 +
17 files changed, 488 insertions(+), 13 deletions(-)
create mode 100644 ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh
create mode 100644 ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml
create mode 100644 ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml
create mode 100644 ChatQnA/docker_compose/intel/cpu/xeon/prometheus.yaml
create mode 100644 ChatQnA/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh
create mode 100644 ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml
create mode 100644 ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml
create mode 100644 ChatQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml
diff --git a/ChatQnA/README.md b/ChatQnA/README.md
index 40fdac003a..50fd79d324 100644
--- a/ChatQnA/README.md
+++ b/ChatQnA/README.md
@@ -70,11 +70,11 @@ To set up environment variables for deploying ChatQnA services, follow these ste
# on Gaudi
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
source ./set_env.sh
- export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
+ export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-node-exporter-1
# on Xeon
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
source ./set_env.sh
- export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
+ export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,xeon-node-exporter-1
# on Nvidia GPU
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
source ./set_env.sh
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index f8475e94d0..6ba093216e 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -59,8 +59,10 @@ docker compose up -d
To enable Open Telemetry Tracing, compose.telemetry.yaml file need to be merged along with default compose.yaml file.
CPU example with Open Telemetry feature:
+> NOTE: To get the supported Grafana dashboards, please run download_opea_dashboard.sh as shown in the commands below.
+
```bash
-cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+./grafana/dashboards/download_opea_dashboard.sh
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
index 4da33d6d50..4456fee747 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
@@ -4,10 +4,19 @@
services:
tei-embedding-service:
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
tei-reranking-service:
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+# vllm-service:
+# command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --otlp-traces-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+ chatqna-xeon-backend-server:
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
jaeger:
- image: jaegertracing/all-in-one:latest
+ image: jaegertracing/all-in-one:1.67.0
container_name: jaeger
ports:
- "16686:16686"
@@ -21,7 +30,51 @@ services:
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
- chatqna-xeon-backend-server:
+ prometheus:
+ image: prom/prometheus:v2.52.0
+ container_name: prometheus
+ user: root
+ volumes:
+ - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
+ - ./prometheus_data:/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yaml'
+ ports:
+ - '9090:9090'
+ ipc: host
+ restart: unless-stopped
+ grafana:
+ image: grafana/grafana:11.0.0
+ container_name: grafana
+ volumes:
+ - ./grafana_data:/var/lib/grafana
+ - ./grafana/dashboards:/var/lib/grafana/dashboards
+ - ./grafana/provisioning:/etc/grafana/provisioning
+ user: root
environment:
- - ENABLE_OPEA_TELEMETRY=true
- - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+ GF_SECURITY_ADMIN_PASSWORD: admin
+ GF_RENDERING_CALLBACK_URL: http://grafana:3000/
+ GF_LOG_FILTERS: rendering:debug
+ depends_on:
+ - prometheus
+ ports:
+ - '3000:3000'
+ ipc: host
+ restart: unless-stopped
+ node-exporter:
+ image: prom/node-exporter
+ container_name: node-exporter
+ volumes:
+ - /proc:/host/proc:ro
+ - /sys:/host/sys:ro
+ - /:/rootfs:ro
+ command:
+ - '--path.procfs=/host/proc'
+ - '--path.sysfs=/host/sys'
+ - --collector.filesystem.ignored-mount-points
+ - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
+ ports:
+ - 9100:9100
+ restart: always
+ deploy:
+ mode: global
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml
index 2ba1375398..dfd263d305 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml
@@ -4,12 +4,21 @@
services:
tei-embedding-service:
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
tei-reranking-service:
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
tgi-service:
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+ chatqna-xeon-backend-server:
+ environment:
+ - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
jaeger:
- image: jaegertracing/all-in-one:latest
+ image: jaegertracing/all-in-one:1.67.0
container_name: jaeger
ports:
- "16686:16686"
@@ -23,7 +32,51 @@ services:
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
- chatqna-xeon-backend-server:
+ prometheus:
+ image: prom/prometheus:v2.52.0
+ container_name: prometheus
+ user: root
+ volumes:
+ - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
+ - ./prometheus_data:/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yaml'
+ ports:
+ - '9090:9090'
+ ipc: host
+ restart: unless-stopped
+ grafana:
+ image: grafana/grafana:11.0.0
+ container_name: grafana
+ volumes:
+ - ./grafana_data:/var/lib/grafana
+ - ./grafana/dashboards:/var/lib/grafana/dashboards
+ - ./grafana/provisioning:/etc/grafana/provisioning
+ user: root
environment:
- - ENABLE_OPEA_TELEMETRY=true
- - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+ GF_SECURITY_ADMIN_PASSWORD: admin
+ GF_RENDERING_CALLBACK_URL: http://grafana:3000/
+ GF_LOG_FILTERS: rendering:debug
+ depends_on:
+ - prometheus
+ ports:
+ - '3000:3000'
+ ipc: host
+ restart: unless-stopped
+ node-exporter:
+ image: prom/node-exporter
+ container_name: node-exporter
+ volumes:
+ - /proc:/host/proc:ro
+ - /sys:/host/sys:ro
+ - /:/rootfs:ro
+ command:
+ - '--path.procfs=/host/proc'
+ - '--path.sysfs=/host/sys'
+ - --collector.filesystem.ignored-mount-points
+ - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
+ ports:
+ - 9100:9100
+ restart: always
+ deploy:
+ mode: global
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh
new file mode 100644
index 0000000000..9b603c0403
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh
@@ -0,0 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml
new file mode 100644
index 0000000000..13922a769b
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml
@@ -0,0 +1,14 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: 1
+
+providers:
+- name: 'default'
+ orgId: 1
+ folder: ''
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
+ options:
+ path: /var/lib/grafana/dashboards
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000000..109fc0978f
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,54 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# config file version
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+ - name: Prometheus
+ orgId: 1
+
+# list of datasources to insert/update depending
+# what's available in the database
+datasources:
+ # name of the datasource. Required
+- name: Prometheus
+ # datasource type. Required
+ type: prometheus
+ # access mode. direct or proxy. Required
+ access: proxy
+ # org id. will default to orgId 1 if not specified
+ orgId: 1
+ # url
+ url: http://prometheus:9090
+ # database password, if used
+ password:
+ # database user, if used
+ user:
+ # database name, if used
+ database:
+ # enable/disable basic auth
+ basicAuth: false
+ # basic auth username, if used
+ basicAuthUser:
+ # basic auth password, if used
+ basicAuthPassword:
+ # enable/disable with credentials headers
+ withCredentials:
+ # mark as default datasource. Max one per org
+ isDefault: true
+ #