Commit 11fa7d5

Add Telemetry support for AgentQnA using Grafana, Prometheus and Jaeger (#1732)

Signed-off-by: louie tsai <[email protected]>
Signed-off-by: Tsai, Louie <[email protected]>

1 parent 76c088d commit 11fa7d5

9 files changed: +261 additions, −7 deletions


AgentQnA/README.md

Lines changed: 19 additions & 3 deletions
@@ -1,5 +1,13 @@
 # Agents for Question Answering
 
+## Table of contents
+
+1. [Overview](#overview)
+2. [Deploy with Docker](#deploy-with-docker)
+3. [Launch the UI](#launch-the-ui)
+4. [Validate Services](#validate-services)
+5. [Register Tools](#how-to-register-other-tools-with-the-ai-agent)
+
 ## Overview
 
 This example showcases a hierarchical multi-agent system for question-answering applications. The architecture diagram below shows a supervisor agent that interfaces with the user and dispatches tasks to two worker agents to gather information and come up with answers. The worker RAG agent uses the retrieval tool to retrieve relevant documents from a knowledge base - a vector database. The worker SQL agent retrieves relevant data from a SQL database. Although not included in this example by default, other tools such as a web search tool or a knowledge graph query tool can be used by the supervisor agent to gather information from additional sources.
@@ -134,7 +142,7 @@ source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh
 source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon/set_env.sh
 ```
 
-### 3. Launch the multi-agent system. </br>
+### 2. Launch the multi-agent system. </br>
 
 Two options are provided for the `llm_engine` of the agents: 1. open-source LLMs on Gaudi, 2. OpenAI models via API calls.
 
@@ -151,11 +159,19 @@ cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
 docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
 ```
 
+To enable OpenTelemetry tracing, merge the compose.telemetry.yaml file with the default compose.yaml file.
+Gaudi example with the OpenTelemetry feature enabled:
+
+```bash
+cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
+docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f compose.telemetry.yaml up -d
+```
+
 ##### [Optional] Web Search Tool Support
 
 <details>
 <summary> Instructions </summary>
-A web search tool is supported in this example and can be enabled by running docker compose with the `compose.webtool.yaml` file.
+A web search tool is supported in this example and can be enabled by running docker compose with the `compose.webtool.yaml` file.
 The Google Search API is used. Follow the [instructions](https://python.langchain.com/docs/integrations/tools/google_search) to create an API key and enable the Custom Search API on a Google account. The environment variables `GOOGLE_CSE_ID` and `GOOGLE_API_KEY` need to be set.
 
 ```bash
@@ -179,7 +195,7 @@ cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
 docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose_openai.yaml up -d
 ```
 
-### 4. Ingest Data into the vector database
+### 3. Ingest Data into the vector database
 
 The `run_ingest_data.sh` script will use an example jsonl file to ingest example documents into a vector database. Other ways to ingest data and other types of documents supported can be found in the OPEA dataprep microservice located in the opea-project/GenAIComps repo.
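The multi-file `docker compose -f ... -f compose.telemetry.yaml up -d` pattern above works because Compose merges the files in order, with later files overriding or extending earlier ones per service. A minimal sketch of that merge rule, with hypothetical service data (the helper and example values are not part of the repo, and real Compose has extra per-key merge rules):

```python
# Sketch of docker compose's multi-file merge: later files override
# scalar keys and merge nested mappings service by service. This is a
# simplified illustration of why compose.telemetry.yaml only needs to
# list the keys it changes, not a full compose file.
def merge_compose(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            # recurse into nested mappings such as services/<name>
            merged[key] = merge_compose(merged[key], value)
        else:
            # later file wins for scalars and lists
            merged[key] = value
    return merged

base = {"services": {"worker-rag-agent": {"image": "opea/agent:latest"}}}
telemetry = {"services": {"worker-rag-agent": {
    "environment": ["TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}"]}}}
combined = merge_compose(base, telemetry)
```

After the merge, `worker-rag-agent` keeps its original image and gains the telemetry environment entry, which matches what the overlay file below adds.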

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  tei-embedding-service:
+    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+  tei-reranking-service:
+    command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+  jaeger:
+    image: jaegertracing/all-in-one:1.67.0
+    container_name: jaeger
+    ports:
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
+      - "9411:9411"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      COLLECTOR_ZIPKIN_HOST_PORT: 9411
+    restart: unless-stopped
+  prometheus:
+    image: prom/prometheus:v2.52.0
+    container_name: prometheus
+    user: root
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
+      - ./prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yaml'
+    ports:
+      - '9091:9090'
+    ipc: host
+    restart: unless-stopped
+  grafana:
+    image: grafana/grafana:11.0.0
+    container_name: grafana
+    volumes:
+      - ./grafana_data:/var/lib/grafana
+      - ./grafana/dashboards:/var/lib/grafana/dashboards
+      - ./grafana/provisioning:/etc/grafana/provisioning
+    user: root
+    environment:
+      GF_SECURITY_ADMIN_PASSWORD: admin
+      GF_RENDERING_CALLBACK_URL: http://grafana:3000/
+      GF_LOG_FILTERS: rendering:debug
+    depends_on:
+      - prometheus
+    ports:
+      - '3000:3000'
+    ipc: host
+    restart: unless-stopped
+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - --collector.filesystem.ignored-mount-points
+      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
+    ports:
+      - 9100:9100
+    restart: always
+    deploy:
+      mode: global
+  gaudi-exporter:
+    image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
+    container_name: gaudi-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - /dev:/dev
+    ports:
+      - 41612:41611
+    restart: always
+    deploy:
+      mode: global
+  worker-rag-agent:
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+  worker-sql-agent:
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+  supervisor-react-agent:
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+rm *.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/chatqna_megaservice_grafana.json
+mv chatqna_megaservice_grafana.json agentqna_microservices_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana.json
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: 1
+
+providers:
+  - name: 'default'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10 # how often Grafana will scan for changed dashboards
+    options:
+      path: /var/lib/grafana/dashboards
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# config file version
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+  - name: Prometheus
+    orgId: 1
+
+# list of datasources to insert/update depending on
+# what's available in the database
+datasources:
+  # <string, required> name of the datasource. Required
+  - name: Prometheus
+    # <string, required> datasource type. Required
+    type: prometheus
+    # <string, required> access mode. direct or proxy. Required
+    access: proxy
+    # <int> org id. will default to orgId 1 if not specified
+    orgId: 1
+    # <string> url
+    url: http://prometheus:9090
+    # <string> database password, if used
+    password:
+    # <string> database user, if used
+    user:
+    # <string> database name, if used
+    database:
+    # <bool> enable/disable basic auth
+    basicAuth: false
+    # <string> basic auth username, if used
+    basicAuthUser:
+    # <string> basic auth password, if used
+    basicAuthPassword:
+    # <bool> enable/disable with credentials headers
+    withCredentials:
+    # <bool> mark as default datasource. Max one per org
+    isDefault: true
+    # <map> fields that will be converted to json and stored in json_data
+    jsonData:
+      httpMethod: GET
+      graphiteVersion: "1.1"
+      tlsAuth: false
+      tlsAuthWithCACert: false
+    # <string> json object of data that will be encrypted.
+    secureJsonData:
+      tlsCACert: "..."
+      tlsClientCert: "..."
+      tlsClientKey: "..."
+    version: 1
+    # <bool> allow users to edit datasources from the UI.
+    editable: true
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+global:
+  scrape_interval: 5s
+  external_labels:
+    monitor: "my-monitor"
+scrape_configs:
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["prometheus:9090"]
+  - job_name: "vllm"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["vllm-gaudi-server:8000"]
+  - job_name: "tgi"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tgi-gaudi-server:80"]
+  - job_name: "tei-embedding"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tei-embedding-server:80"]
+  - job_name: "tei-reranking"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tei-reranking-server:80"]
+  - job_name: "retriever"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["retriever:7000"]
+  - job_name: "dataprep-redis-service"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["dataprep-redis-service:5000"]
+  - job_name: "prometheus-node-exporter"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["node-exporter:9100"]
+  - job_name: "prometheus-gaudi-exporter"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["gaudi-exporter:41611"]
+  - job_name: "supervisor-react-agent"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["react-agent-endpoint:9090"]
+  - job_name: "worker-rag-agent"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["rag-agent-endpoint:9095"]
+  - job_name: "worker-sql-agent"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["sql-agent-endpoint:9096"]
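Every job in the scrape config above pairs a `job_name` with a static target list. A small stdlib-only sketch (a hypothetical helper, not part of the repo) that flattens such a config into a job-to-targets map, handy for sanity-checking that each agent endpoint is actually scraped:

```python
# Hypothetical helper mirroring the shape of Prometheus scrape_configs:
# collect every static target under its job_name.
def targets_by_job(scrape_configs: list) -> dict:
    out = {}
    for job in scrape_configs:
        targets = []
        for static in job.get("static_configs", []):
            targets.extend(static.get("targets", []))
        out[job["job_name"]] = targets
    return out

# Two entries copied from the config above, expressed as Python data.
scrape_configs = [
    {"job_name": "worker-rag-agent",
     "static_configs": [{"targets": ["rag-agent-endpoint:9095"]}]},
    {"job_name": "worker-sql-agent",
     "static_configs": [{"targets": ["sql-agent-endpoint:9096"]}]},
]
mapping = targets_by_job(scrape_configs)
```

This mirrors how Prometheus resolves each job: it scrapes `http://<target>/metrics` for every target listed under the job's `static_configs`.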

AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh

Lines changed: 5 additions & 2 deletions
@@ -64,6 +64,9 @@ export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
 export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
 export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
 export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"
+# Set OpenTelemetry Tracing Endpoint
+export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
+export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
 
-
-export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$host_ip"
+export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,node-exporter,gaudi-exporter,127.0.0.1,localhost,0.0.0.0,$host_ip,$JAEGER_IP"
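The exports above derive two endpoint styles from the same Jaeger host: OTLP over gRPC on port 4317 and OTLP over HTTP on port 4318 (the `/v1/traces` path). A small Python mirror of that construction (a hypothetical helper shown only to make the two formats explicit; the example IP is made up):

```python
# Hypothetical mirror of the set_env.sh exports: build the OTLP/gRPC
# and OTLP/HTTP trace endpoints from the detected Jaeger host IP.
def otel_endpoints(jaeger_ip: str) -> dict:
    return {
        # gRPC collector endpoint, consumed by the TEI services
        "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT": f"grpc://{jaeger_ip}:4317",
        # HTTP collector endpoint, consumed by the agent microservices
        "TELEMETRY_ENDPOINT": f"http://{jaeger_ip}:4318/v1/traces",
    }

env = otel_endpoints("192.168.1.10")
```

Keeping both forms matters because some exporters speak only OTLP/gRPC while others post spans to the OTLP/HTTP `/v1/traces` route.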

AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ function start_tgi(){
 echo "Starting tgi-gaudi server"
 cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
 source set_env.sh
-docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml tgi_gaudi.yaml up -d
+docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f tgi_gaudi.yaml -f compose.telemetry.yaml up -d
 
 }
 
@@ -40,7 +40,7 @@ function start_all_services() {
 echo "**************model is $model**************"
 cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
 source set_env.sh
-docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
+docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f compose.telemetry.yaml up -d
 sleep 5s
 echo "Waiting vllm gaudi ready"
 n=0
AgentQnA/tools/tools.py

Lines changed: 9 additions & 0 deletions
@@ -4,9 +4,11 @@
 import os
 
 import requests
+from comps.cores.telemetry.opea_telemetry import opea_telemetry, tracer
 from tools.pycragapi import CRAG
 
 
+@opea_telemetry
 def search_web_base(query: str) -> str:
     import os
 
@@ -25,6 +27,7 @@ def search_web_base(query: str) -> str:
     return response
 
 
+@opea_telemetry
 def search_knowledge_base(query: str) -> str:
     """Search a knowledge base about music and singers for a given query.
 
@@ -40,6 +43,7 @@ def search_knowledge_base(query: str) -> str:
     return response.json()["text"]
 
 
+@opea_telemetry
 def search_sql_database(query: str) -> str:
     """Search a SQL database on artists and their music with a natural language query.
 
@@ -55,32 +59,37 @@ def search_sql_database(query: str) -> str:
     return response.json()["text"]
 
 
+@opea_telemetry
 def get_grammy_best_artist_by_year(year: int) -> dict:
     """Get the Grammy Best New Artist for a specific year."""
     api = CRAG()
     year = int(year)
     return api.music_grammy_get_best_artist_by_year(year)
 
 
+@opea_telemetry
 def get_members(band_name: str) -> dict:
     """Get the member list of a band."""
     api = CRAG()
     return api.music_get_members(band_name)
 
 
+@opea_telemetry
 def get_artist_birth_place(artist_name: str) -> dict:
     """Get the birthplace of an artist."""
     api = CRAG()
     return api.music_get_artist_birth_place(artist_name)
 
 
+@opea_telemetry
 def get_billboard_rank_date(rank: int, date: str = None) -> dict:
     """Get Billboard ranking for a specific rank and date."""
     api = CRAG()
     rank = int(rank)
     return api.music_get_billboard_rank_date(rank, date)
 
 
+@opea_telemetry
 def get_song_release_date(song_name: str) -> dict:
     """Get the release date of a song."""
     api = CRAG()
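The commit wraps each tool with `@opea_telemetry` so every tool call shows up as a trace span. The sketch below shows the general shape of such a decorator with a toy span recorder; it is a hypothetical stand-in, not the actual opea_telemetry implementation from GenAIComps (which exports OpenTelemetry spans to the configured collector instead of appending to a list):

```python
import functools
import time

# Toy span store standing in for an OpenTelemetry exporter.
SPANS = []

def traced(func):
    """Hypothetical tracing decorator: record each call's name and duration."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            # Record the span even if the wrapped tool raises.
            SPANS.append({"name": func.__name__,
                          "duration_s": time.perf_counter() - start})
    return wrapper

@traced
def get_members(band_name: str) -> dict:
    """Toy tool standing in for the CRAG-backed version above."""
    return {"band": band_name, "members": []}

result = get_members("The Beatles")
```

Because the decorator uses `functools.wraps`, the tool's name and docstring survive wrapping, which matters when an agent framework introspects tool signatures to build its tool registry.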
