From 199c96c243aec98782067742e8a9130beb66031d Mon Sep 17 00:00:00 2001
From: Al Cutter
Date: Thu, 19 Sep 2024 16:10:03 +0100
Subject: [PATCH] Add some simple device metrics to dashboard (#220)

---
Review notes (text in this section is ignored when the patch is applied):

* Adds two dashboard widgets, "Devices seen online" and "% online", plus a
  new required module variable `num_expected_devices`, which is set per
  environment (ci=5, dev=2, prod=15) and used as the denominator of the
  "% online" chart.
* Both queries aggregate with sum() rather than count(). The inner
  expression ends in `> bool 0`, which yields 0 or 1 for every series
  instead of filtering, so count() would also include series whose request
  rate is zero and the charts would over-report. Summing the 0/1 values
  counts only series with a non-zero rate.
* Leading indentation of hunk lines was reconstructed; verify it against
  the target files (or apply with --ignore-whitespace).

 deployment/live/monitoring/ci/terragrunt.hcl  |  1 +
 deployment/live/monitoring/dev/terragrunt.hcl |  1 +
 .../live/monitoring/prod/terragrunt.hcl       |  1 +
 deployment/modules/monitoring/main.tf         | 35 +++++++++++++++++++
 deployment/modules/monitoring/variables.tf    |  7 +++-
 5 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/deployment/live/monitoring/ci/terragrunt.hcl b/deployment/live/monitoring/ci/terragrunt.hcl
index 5d0260e..d3a4568 100644
--- a/deployment/live/monitoring/ci/terragrunt.hcl
+++ b/deployment/live/monitoring/ci/terragrunt.hcl
@@ -7,6 +7,7 @@ inputs = merge(
   include.root.locals,
   {
     alert_lt_num_witness_threshold = 0
+    num_expected_devices = 5
   }
 )
 
diff --git a/deployment/live/monitoring/dev/terragrunt.hcl b/deployment/live/monitoring/dev/terragrunt.hcl
index 5d0260e..e7f247c 100644
--- a/deployment/live/monitoring/dev/terragrunt.hcl
+++ b/deployment/live/monitoring/dev/terragrunt.hcl
@@ -7,6 +7,7 @@ inputs = merge(
   include.root.locals,
   {
     alert_lt_num_witness_threshold = 0
+    num_expected_devices = 2
   }
 )
 
diff --git a/deployment/live/monitoring/prod/terragrunt.hcl b/deployment/live/monitoring/prod/terragrunt.hcl
index 8acb465..979b22c 100644
--- a/deployment/live/monitoring/prod/terragrunt.hcl
+++ b/deployment/live/monitoring/prod/terragrunt.hcl
@@ -8,6 +8,7 @@ inputs = merge(
   {
     alert_lt_num_witness_threshold = 10
     alert_enable_num_witness = false
+    num_expected_devices = 15
   }
 )
 
diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf
index 2475c2a..45cfaa7 100644
--- a/deployment/modules/monitoring/main.tf
+++ b/deployment/modules/monitoring/main.tf
@@ -84,6 +84,41 @@ resource "google_monitoring_dashboard" "witness_dashboard" {
           }
         }
       },
+      {
+        "title": "Devices seen online",
+        "xyChart": {
+          "dataSets": [{
+            "timeSeriesQuery": {
+              "prometheusQuery": "sum by (witness_id) (max by (instance_id, witness_id) (rate(distributor_update_checkpoint_request{configuration_name='distributor-service-${var.env}'}[$${__interval}]) > bool 0))"
+            },
+            "plotType": "STACKED_AREA"
+          }],
+          "timeshiftDuration": "0s",
+          "yAxis": {
+            "label": "Devices",
+            "scale": "LINEAR"
+          }
+        }
+      },
+      {
+        "title": "% online (assuming ${var.num_expected_devices} devices)",
+        "xyChart": {
+          "dataSets": [{
+            "timeSeriesQuery": {
+              "prometheusQuery": "sum by (instance_id) (max by (instance_id, witness_id) (rate(distributor_update_checkpoint_request{configuration_name='distributor-service-${var.env}'}[$${__interval}]) > bool 0)) * 100 / ${var.num_expected_devices}"
+            },
+            "plotType": "STACKED_AREA"
+          }],
+          "thresholds": [{
+            "value": 51
+          }],
+          "timeshiftDuration": "0s",
+          "yAxis": {
+            "label": "%",
+            "scale": "LINEAR"
+          }
+        }
+      },
       {
         "title": "Witness liveness alert chart",
         "alertChart": {
diff --git a/deployment/modules/monitoring/variables.tf b/deployment/modules/monitoring/variables.tf
index 2836bef..47540e4 100644
--- a/deployment/modules/monitoring/variables.tf
+++ b/deployment/modules/monitoring/variables.tf
@@ -29,6 +29,11 @@ variable "env" {
   type = string
 }
 
+variable "num_expected_devices" {
+  description = "Number of expected devices"
+  type = number
+}
+
 variable "alert_lt_num_witness_threshold" {
   description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric."
   type = number
@@ -37,4 +42,4 @@ variable "alert_lt_num_witness_threshold" {
 variable "alert_enable_num_witness" {
   description = "Whether to enable alert_lt_num_witness_threshold."
   type = bool
-}
\ No newline at end of file
+}