Skip to content

Commit 176f08c

Browse files
committed
Add Telemetry support for AgentQnA using Grafana, Prometheus and Jaeger
Signed-off-by: louie tsai <[email protected]> Signed-off-by: Tsai, Louie <[email protected]>
1 parent 583428c commit 176f08c

File tree

7 files changed

+241
-4
lines changed

7 files changed

+241
-4
lines changed

AgentQnA/README.md

+10-2
Original file line numberDiff line numberDiff line change
@@ -151,19 +151,27 @@ cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
151151
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
152152
```
153153

154+
To enable OpenTelemetry tracing, the `compose.telemetry.yaml` file needs to be merged with the default `compose.yaml` file.
155+
Gaudi example with Open Telemetry feature:
156+
157+
```bash
158+
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
159+
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f compose.telemetry.yaml up -d
160+
```
161+
154162
##### [Optional] Web Search Tool Support
155163

156164
<details>
157165
<summary> Instructions </summary>
158-
A web search tool is supported in this example and can be enabled by running docker compose with the `compose.webtool.yaml` file.
166+
A web search tool is supported in this example and can be enabled by running docker compose with the `compose.webtool.yaml` file.
159167
The Google Search API is used. Follow the [instructions](https://python.langchain.com/docs/integrations/tools/google_search) to create an API key and enable the Custom Search API on a Google account. The environment variables `GOOGLE_CSE_ID` and `GOOGLE_API_KEY` need to be set.
160168

161169
```bash
162170
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
163171
export GOOGLE_CSE_ID="YOUR_ID"
164172
export GOOGLE_API_KEY="YOUR_API_KEY"
165173
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f compose.webtool.yaml up -d
166-
```
174+
````
167175
168176
</details>
169177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
services:
5+
tei-embedding-service:
6+
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
7+
tei-reranking-service:
8+
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
9+
jaeger:
10+
image: jaegertracing/all-in-one:1.67.0
11+
container_name: jaeger
12+
ports:
13+
- "16686:16686"
14+
- "4317:4317"
15+
- "4318:4318"
16+
- "9411:9411"
17+
ipc: host
18+
environment:
19+
no_proxy: ${no_proxy}
20+
http_proxy: ${http_proxy}
21+
https_proxy: ${https_proxy}
22+
COLLECTOR_ZIPKIN_HOST_PORT: 9411
23+
restart: unless-stopped
24+
prometheus:
25+
image: prom/prometheus:v2.52.0
26+
container_name: prometheus
27+
user: root
28+
volumes:
29+
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
30+
- ./prometheus_data:/prometheus
31+
command:
32+
- '--config.file=/etc/prometheus/prometheus.yaml'
33+
ports:
34+
- '9091:9090'
35+
ipc: host
36+
restart: unless-stopped
37+
grafana:
38+
image: grafana/grafana:11.0.0
39+
container_name: grafana
40+
volumes:
41+
- ./grafana_data:/var/lib/grafana
42+
- ./grafana/dashboards:/var/lib/grafana/dashboards
43+
- ./grafana/provisioning:/etc/grafana/provisioning
44+
user: root
45+
environment:
46+
GF_SECURITY_ADMIN_PASSWORD: admin
47+
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
48+
GF_LOG_FILTERS: rendering:debug
49+
depends_on:
50+
- prometheus
51+
ports:
52+
- '3000:3000'
53+
ipc: host
54+
restart: unless-stopped
55+
node-exporter:
56+
image: prom/node-exporter
57+
container_name: node-exporter
58+
volumes:
59+
- /proc:/host/proc:ro
60+
- /sys:/host/sys:ro
61+
- /:/rootfs:ro
62+
command:
63+
- '--path.procfs=/host/proc'
64+
- '--path.sysfs=/host/sys'
65+
- --collector.filesystem.ignored-mount-points
66+
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
67+
ports:
68+
- "9100:9100"  # quoted like the other port mappings so it is always parsed as a string
69+
restart: always
70+
deploy:
71+
mode: global
72+
gaudi-exporter:
73+
image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
74+
container_name: gaudi-exporter
75+
volumes:
76+
- /proc:/host/proc:ro
77+
- /sys:/host/sys:ro
78+
- /:/rootfs:ro
79+
- /dev:/dev
80+
ports:
81+
- "41612:41611"  # host 41612 -> container 41611; quoted so it is always parsed as a string
82+
restart: always
83+
deploy:
84+
mode: global
85+
worker-rag-agent:
86+
environment:
87+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
88+
worker-sql-agent:
89+
environment:
90+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
91+
supervisor-react-agent:
92+
environment:
93+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
rm -f *.json  # -f: do not fail or print an error when no dashboard JSON exists yet (first run)
5+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/chatqna_megaservice_grafana.json
6+
mv chatqna_megaservice_grafana.json agentqna_microservices_grafana.json  # reuse the ChatQnA dashboard under an AgentQnA file name
7+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
8+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
9+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
10+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana.json
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: 1
5+
6+
providers:
7+
- name: 'default'
8+
orgId: 1
9+
folder: ''
10+
type: file
11+
disableDeletion: false
12+
updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
13+
options:
14+
path: /var/lib/grafana/dashboards
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# config file version
5+
apiVersion: 1
6+
7+
# list of datasources that should be deleted from the database
8+
deleteDatasources:
9+
- name: Prometheus
10+
orgId: 1
11+
12+
# list of datasources to insert/update depending
13+
# what's available in the database
14+
datasources:
15+
# <string, required> name of the datasource. Required
16+
- name: Prometheus
17+
# <string, required> datasource type. Required
18+
type: prometheus
19+
# <string, required> access mode. direct or proxy. Required
20+
access: proxy
21+
# <int> org id. will default to orgId 1 if not specified
22+
orgId: 1
23+
# <string> url
24+
url: http://prometheus:9090
25+
# <string> database password, if used
26+
password:
27+
# <string> database user, if used
28+
user:
29+
# <string> database name, if used
30+
database:
31+
# <bool> enable/disable basic auth
32+
basicAuth: false
33+
# <string> basic auth username, if used
34+
basicAuthUser:
35+
# <string> basic auth password, if used
36+
basicAuthPassword:
37+
# <bool> enable/disable with credentials headers
38+
withCredentials:
39+
# <bool> mark as default datasource. Max one per org
40+
isDefault: true
41+
# <map> fields that will be converted to json and stored in json_data
42+
jsonData:
43+
httpMethod: GET
44+
graphiteVersion: "1.1"
45+
tlsAuth: false
46+
tlsAuthWithCACert: false
47+
# <string> json object of data that will be encrypted.
48+
secureJsonData:
49+
tlsCACert: "..."
50+
tlsClientCert: "..."
51+
tlsClientKey: "..."
52+
version: 1
53+
# <bool> allow users to edit datasources from the UI.
54+
editable: true
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
global:
5+
scrape_interval: 5s
6+
external_labels:
7+
monitor: "my-monitor"
8+
scrape_configs:
9+
- job_name: "prometheus"
10+
static_configs:
11+
- targets: ["prometheus:9090"]
12+
- job_name: "vllm"
13+
metrics_path: /metrics
14+
static_configs:
15+
- targets: ["vllm-gaudi-server:8000"]
16+
- job_name: "tgi"
17+
metrics_path: /metrics
18+
static_configs:
19+
- targets: ["tgi-gaudi-server:80"]
20+
- job_name: "tei-embedding"
21+
metrics_path: /metrics
22+
static_configs:
23+
- targets: ["tei-embedding-server:80"]
24+
- job_name: "tei-reranking"
25+
metrics_path: /metrics
26+
static_configs:
27+
- targets: ["tei-reranking-server:80"]
28+
- job_name: "retriever"
29+
metrics_path: /metrics
30+
static_configs:
31+
- targets: ["retriever:7000"]
32+
- job_name: "dataprep-redis-service"
33+
metrics_path: /metrics
34+
static_configs:
35+
- targets: ["dataprep-redis-service:5000"]
36+
- job_name: "prometheus-node-exporter"
37+
metrics_path: /metrics
38+
static_configs:
39+
- targets: ["node-exporter:9100"]
40+
- job_name: "prometheus-gaudi-exporter"
41+
metrics_path: /metrics
42+
static_configs:
43+
- targets: ["gaudi-exporter:41611"]
44+
- job_name: "supervisor-react-agent"
45+
metrics_path: /metrics
46+
static_configs:
47+
- targets: ["react-agent-endpoint:9090"]
48+
- job_name: "worker-rag-agent"
49+
metrics_path: /metrics
50+
static_configs:
51+
- targets: ["rag-agent-endpoint:9095"]
52+
- job_name: "worker-sql-agent"
53+
metrics_path: /metrics
54+
static_configs:
55+
- targets: ["sql-agent-endpoint:9096"]

AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh

+5-2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
6464
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
6565
export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
6666
export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"
67+
# Set OpenTelemetry Tracing Endpoint
68+
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
69+
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
70+
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
6771

68-
69-
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$host_ip"
72+
# Bypass the proxy for all in-cluster service names, the telemetry stack, and the host/Jaeger IPs.
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,node-exporter,gaudi-exporter,127.0.0.1,localhost,0.0.0.0,$host_ip,$JAEGER_IP"

0 commit comments

Comments
 (0)