Commit 1bd56af

Update TGI image versions (#1625)
Signed-off-by: xiaotia3 <[email protected]>
1 parent 583428c · commit 1bd56af

36 files changed: +54 -52 lines changed


AgentQnA/docker_compose/amd/gpu/rocm/launch_agent_service_tgi_rocm.sh

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 WORKPATH=$(dirname "$PWD")/..
 export ip_address=${host_ip}
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
 export AGENTQNA_TGI_SERVICE_PORT="8085"

 # LLM related environment variables
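
For anyone verifying this bump locally, the exported tag can be pulled and inspected on its own before running the launch script; a minimal sketch (the image tag comes from the hunk above, the commands are plain Docker CLI and not part of the script):

  # Sketch: confirm the new ROCm image resolves and is present locally.
  export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
  docker pull "${AGENTQNA_TGI_IMAGE}"
  docker image inspect "${AGENTQNA_TGI_IMAGE}" --format '{{.Id}}'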

AgentQnA/docker_compose/amd/gpu/rocm/set_env.sh

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 WORKPATH=$(dirname "$PWD")/..
 export ip_address=${host_ip}
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
 export AGENTQNA_TGI_SERVICE_PORT="19001"

 # LLM related environment variables

AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

 services:
   tgi-server:
-    image: ghcr.io/huggingface/tgi-gaudi:2.0.6
+    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
     container_name: tgi-server
     ports:
       - "8085:80"

AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ services:
       https_proxy: ${https_proxy}
     restart: unless-stopped
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+    image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
     container_name: tgi-service
     ports:
       - "3006:80"

AudioQnA/kubernetes/gmc/README.md

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@ The AudioQnA application is defined as a Custom Resource (CR) file that the abov

 The AudioQnA uses the below prebuilt images if you choose a Xeon deployment

-- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
+- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.1
 - llm: opea/llm-textgen:latest
 - asr: opea/asr:latest
 - whisper: opea/whisper:latest
@@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
 Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services.
 For Gaudi:

-- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6
+- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.3.1
 - whisper-gaudi: opea/whisper-gaudi:latest
 - speecht5-gaudi: opea/speecht5-gaudi:latest

AudioQnA/tests/test_compose_on_rocm.sh

Lines changed: 2 additions & 2 deletions
@@ -34,8 +34,8 @@ function build_docker_images() {
 echo "Build all the images with --no-cache, check docker_image_build.log for details..."
 service_list="audioqna audioqna-ui whisper speecht5"
 docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-echo "docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
-docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+echo "docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
+docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
 docker images && sleep 1s
 }

ChatQnA/benchmark/accuracy/README.md

Lines changed: 2 additions & 2 deletions
@@ -45,10 +45,10 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-
 ```
 # please set your llm_port and hf_token

-docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
+docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2

 # for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens`
-docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
+docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
 ```

 ### Prepare Dataset
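
Once either container from the README snippet above is running, a quick request against TGI's standard /generate route confirms the upgraded tgi-gaudi:2.3.1 image serves traffic; a minimal sketch (substitute {your_llm_port} as in the commands above):

  # Sketch: smoke-test the upgraded TGI endpoint.
  curl http://localhost:{your_llm_port}/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64}}'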

ChatQnA/benchmark/accuracy_faqgen/launch_tgi.sh

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ docker run -it --rm \
 --ipc=host \
 -e HTTPS_PROXY=$https_proxy \
 -e HTTP_PROXY=$https_proxy \
-ghcr.io/huggingface/tgi-gaudi:2.0.6 \
+ghcr.io/huggingface/tgi-gaudi:2.3.1 \
 --model-id $model_name \
 --max-input-tokens $max_input_tokens \
 --max-total-tokens $max_total_tokens \

ChatQnA/docker_compose/amd/gpu/rocm/README.md

Lines changed: 1 addition & 1 deletion
@@ -190,7 +190,7 @@ Change the `xxx_MODEL_ID` below for your needs.
 # Example: NGINX_PORT=80
 export HOST_IP=${host_ip}
 export NGINX_PORT=${your_nginx_port}
-export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
+export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
 export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"

ChatQnA/docker_compose/intel/hpu/gaudi/README.md

Lines changed: 8 additions & 8 deletions
@@ -158,7 +158,7 @@ The default deployment utilizes Gaudi devices primarily for the `vllm-service`,

 ### compose_tgi.yaml - TGI Deployment

-The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
+The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.

 | Service Name | Image Name | Gaudi Specific |
 | ---------------------------- | ----------------------------------------------------- | -------------- |
@@ -167,7 +167,7 @@ The TGI (Text Generation Inference) deployment and the default deployment differ
 | tei-embedding-service | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No |
 | retriever | opea/retriever:latest | No |
 | tei-reranking-service | ghcr.io/huggingface/tei-gaudi:1.5.0 | 1 card |
-| **tgi-service** | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Configurable |
+| **tgi-service** | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Configurable |
 | chatqna-gaudi-backend-server | opea/chatqna:latest | No |
 | chatqna-gaudi-ui-server | opea/chatqna-ui:latest | No |
 | chatqna-gaudi-nginx-server | opea/nginx:latest | No |
@@ -178,7 +178,7 @@ This deployment may allocate more Gaudi resources to the tgi-service to optimize

 The FAQs(frequently asked questions and answers) generation Deployment will generate FAQs instead of normally text generation. It add a new microservice called `llm-faqgen`, which is a microservice that interacts with the TGI/vLLM LLM server to generate FAQs from input text.

-The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
+The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.

 | Service Name | Image Name | Gaudi Use |
 | ---------------------------- | ----------------------------------------------------- | ------------ |
@@ -214,13 +214,13 @@ This setup might allow for more Gaudi devices to be dedicated to the `vllm-servi

 ### compose_guardrails.yaml - Guardrails Deployment

-The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.
+The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.

 | Service Name | Image Name | Gaudi Specific | Uses LLM |
 | ---------------------------- | ----------------------------------------------------- | -------------- | -------- |
 | redis-vector-db | redis/redis-stack:7.2.0-v9 | No | No |
 | dataprep-redis-service | opea/dataprep:latest | No | No |
-| _tgi-guardrails-service_ | ghcr.io/huggingface/tgi-gaudi:2.0.6 | 1 card | Yes |
+| _tgi-guardrails-service_ | ghcr.io/huggingface/tgi-gaudi:2.3.1 | 1 card | Yes |
 | _guardrails_ | opea/guardrails:latest | No | No |
 | tei-embedding-service | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No | No |
 | retriever | opea/retriever:latest | No | No |
@@ -262,8 +262,8 @@ The table provides a comprehensive overview of the ChatQnA services utilized acr
 | retriever | opea/retriever:latest | No | Retrieves data from the Redis database and interacts with embedding services. |
 | tei-reranking-service | ghcr.io/huggingface/tei-gaudi:1.5.0 | Yes | Reranks text embeddings, typically using Gaudi hardware for enhanced performance. |
 | vllm-service | opea/vllm-gaudi:latest | No | Handles large language model (LLM) tasks, utilizing Gaudi hardware. |
-| tgi-service | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Yes | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware. |
-| tgi-guardrails-service | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Yes | Provides guardrails functionality, ensuring safe operations within defined limits. |
+| tgi-service | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Yes | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware. |
+| tgi-guardrails-service | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Yes | Provides guardrails functionality, ensuring safe operations within defined limits. |
 | guardrails | opea/guardrails:latest | Yes | Acts as a safety layer, interfacing with the `tgi-guardrails-service` to enforce safety protocols. |
 | chatqna-gaudi-backend-server | opea/chatqna:latest | No | Serves as the backend for the ChatQnA application, with variations depending on the deployment. |
 | chatqna-gaudi-ui-server | opea/chatqna-ui:latest | No | Provides the user interface for the ChatQnA application. |
@@ -288,7 +288,7 @@ The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.6` image supporting `te

 ### tgi-gaurdrails-service

-The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.0.6` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.
+The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.3.1` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.

 ## Conclusion
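
Since the README text above references `GUARDRAILS_MODEL_ID` for the guardrails image, a minimal sketch of exercising the compose_guardrails.yaml change shown below (the model ID here is illustrative, not part of this commit):

  # Sketch: export an illustrative guard model and start only the guardrails pair.
  export GUARDRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"   # illustrative
  docker compose -f compose_guardrails.yaml up -d tgi-guardrails-service guardrails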

ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ services:
       TEI_ENDPOINT: http://tei-embedding-service:80
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
   tgi-guardrails-service:
-    image: ghcr.io/huggingface/tgi-gaudi:2.0.6
+    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
     container_name: tgi-guardrails-server
     ports:
       - "8088:80"

ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ services:
       MAX_WARMUP_SEQUENCE_LENGTH: 512
     command: --model-id ${RERANK_MODEL_ID} --auto-truncate
   tgi-service:
-    image: ghcr.io/huggingface/tgi-gaudi:2.0.6
+    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
     container_name: tgi-gaudi-server
     ports:
       - "8005:80"
