From 03d32bd7c986cd41c4fe664f3a90b871e3bcf868 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 10 Apr 2024 11:02:58 +0000 Subject: [PATCH 1/5] Define liveness alert policy and add its graph to the dashboard. --- deployment/live/monitoring/ci/terragrunt.hcl | 1 + deployment/live/monitoring/dev/terragrunt.hcl | 1 + .../live/monitoring/prod/terragrunt.hcl | 1 + deployment/modules/monitoring/main.tf | 28 +++++++++++++++++++ deployment/modules/monitoring/variables.tf | 4 +++ 5 files changed, 35 insertions(+) diff --git a/deployment/live/monitoring/ci/terragrunt.hcl b/deployment/live/monitoring/ci/terragrunt.hcl index 94ba5aa..d734450 100644 --- a/deployment/live/monitoring/ci/terragrunt.hcl +++ b/deployment/live/monitoring/ci/terragrunt.hcl @@ -6,6 +6,7 @@ include "root" { inputs = merge( include.root.locals, { + lt_num_witness_threshold = 0 } ) diff --git a/deployment/live/monitoring/dev/terragrunt.hcl b/deployment/live/monitoring/dev/terragrunt.hcl index 94ba5aa..d734450 100644 --- a/deployment/live/monitoring/dev/terragrunt.hcl +++ b/deployment/live/monitoring/dev/terragrunt.hcl @@ -6,6 +6,7 @@ include "root" { inputs = merge( include.root.locals, { + lt_num_witness_threshold = 0 } ) diff --git a/deployment/live/monitoring/prod/terragrunt.hcl b/deployment/live/monitoring/prod/terragrunt.hcl index 94ba5aa..8b709e1 100644 --- a/deployment/live/monitoring/prod/terragrunt.hcl +++ b/deployment/live/monitoring/prod/terragrunt.hcl @@ -6,6 +6,7 @@ include "root" { inputs = merge( include.root.locals, { + lt_num_witness_threshold = 10 } ) diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf index c6f4da3..b3d67a2 100644 --- a/deployment/modules/monitoring/main.tf +++ b/deployment/modules/monitoring/main.tf @@ -83,6 +83,12 @@ resource "google_monitoring_dashboard" "witness_dashboard" { "scale": "LINEAR" } } + }, + { + "title": "Witness liveness alert chart", + "alertChart": { + "name": "${google_monitoring_alert_policy.witness_liveness.name}" + } } ] } @@ -90,6 +96,28 @@ resource "google_monitoring_dashboard" "witness_dashboard" { EOF } +resource "google_monitoring_alert_policy" "witness_liveness" { + display_name = "Number of live witnesses (${var.env})" + combiner = "OR" + conditions { + display_name = "Number of live witnesses < ${var.lt_num_witness_threshold}" + condition_monitoring_query_language { + query = <<-EOT + fetch prometheus_target + | metric + 'prometheus.googleapis.com/distributor_update_checkpoint_success/counter' + | filter (resource.namespace == 'distributor-service-${var.env}') + | align rate(1m) + | every 1m + | group_by [metric.witness_id, metric.instanceId], [row_count: row_count()] + | group_by [metric.witness_id], [row_count_mean: mean(row_count)] + | lt(${var.lt_num_witness_threshold}) + EOT + duration = "1800s" + } + } +} + resource "google_monitoring_alert_policy" "receiving_updates" { display_name = "Receiving Updates (${var.env})" combiner = "OR" diff --git a/deployment/modules/monitoring/variables.tf b/deployment/modules/monitoring/variables.tf index 0b9a24f..6dca144 100644 --- a/deployment/modules/monitoring/variables.tf +++ b/deployment/modules/monitoring/variables.tf @@ -29,3 +29,7 @@ variable "env" { type = string } +variable "lt_num_witness_threshold" { + description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric." + type = number +} From 5a82f46f67dde88bb4fd0f67d1b3c4b475eb1d14 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 10 Apr 2024 11:05:55 +0000 Subject: [PATCH 2/5] Add comment. --- deployment/modules/monitoring/main.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf index b3d67a2..68c3bef 100644 --- a/deployment/modules/monitoring/main.tf +++ b/deployment/modules/monitoring/main.tf @@ -102,6 +102,10 @@ resource "google_monitoring_alert_policy" "witness_liveness" { conditions { display_name = "Number of live witnesses < ${var.lt_num_witness_threshold}" condition_monitoring_query_language { + # First group by both witness_id and instanceId, since the metric for + # each witness is reported by multiple instances. Then, since the + # timeseries across instances overlap, take the average. This ensures + # that the count for each witness is not double-counted across instances. query = <<-EOT fetch prometheus_target | metric From 128d4f4b2240d72ef94660e7458a0e1e1964cbaf Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 10 Apr 2024 20:15:51 +0000 Subject: [PATCH 3/5] Rename alert threshold var. --- deployment/live/monitoring/ci/terragrunt.hcl | 2 +- deployment/live/monitoring/dev/terragrunt.hcl | 2 +- deployment/live/monitoring/prod/terragrunt.hcl | 2 +- deployment/modules/monitoring/main.tf | 4 ++-- deployment/modules/monitoring/variables.tf | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deployment/live/monitoring/ci/terragrunt.hcl b/deployment/live/monitoring/ci/terragrunt.hcl index d734450..5d0260e 100644 --- a/deployment/live/monitoring/ci/terragrunt.hcl +++ b/deployment/live/monitoring/ci/terragrunt.hcl @@ -6,7 +6,7 @@ include "root" { inputs = merge( include.root.locals, { - lt_num_witness_threshold = 0 + alert_lt_num_witness_threshold = 0 } ) diff --git a/deployment/live/monitoring/dev/terragrunt.hcl b/deployment/live/monitoring/dev/terragrunt.hcl index d734450..5d0260e 100644 --- a/deployment/live/monitoring/dev/terragrunt.hcl +++ b/deployment/live/monitoring/dev/terragrunt.hcl @@ -6,7 +6,7 @@ include "root" { inputs = merge( include.root.locals, { - lt_num_witness_threshold = 0 + alert_lt_num_witness_threshold = 0 } ) diff --git a/deployment/live/monitoring/prod/terragrunt.hcl b/deployment/live/monitoring/prod/terragrunt.hcl index 8b709e1..ab22c8b 100644 --- a/deployment/live/monitoring/prod/terragrunt.hcl +++ b/deployment/live/monitoring/prod/terragrunt.hcl @@ -6,7 +6,7 @@ include "root" { inputs = merge( include.root.locals, { - lt_num_witness_threshold = 10 + alert_lt_num_witness_threshold = 10 } ) diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf index 68c3bef..3cd3ff8 100644 --- a/deployment/modules/monitoring/main.tf +++ b/deployment/modules/monitoring/main.tf @@ -100,7 +100,7 @@ resource "google_monitoring_alert_policy" "witness_liveness" { display_name = "Number of live witnesses (${var.env})" combiner = "OR" conditions { - display_name = "Number of live witnesses < ${var.lt_num_witness_threshold}" + display_name = "Number of live witnesses < ${var.alert_lt_num_witness_threshold}" condition_monitoring_query_language { # First group by both witness_id and instanceId, since the metric for # each witness is reported by multiple instances. Then, since the @@ -115,7 +115,7 @@ resource "google_monitoring_alert_policy" "witness_liveness" { | every 1m | group_by [metric.witness_id, metric.instanceId], [row_count: row_count()] | group_by [metric.witness_id], [row_count_mean: mean(row_count)] - | lt(${var.lt_num_witness_threshold}) + | lt(${var.alert_lt_num_witness_threshold}) EOT duration = "1800s" } diff --git a/deployment/modules/monitoring/variables.tf b/deployment/modules/monitoring/variables.tf index 6dca144..c60bf96 100644 --- a/deployment/modules/monitoring/variables.tf +++ b/deployment/modules/monitoring/variables.tf @@ -29,7 +29,7 @@ variable "env" { type = string } -variable "lt_num_witness_threshold" { +variable "alert_lt_num_witness_threshold" { description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric." type = number } From a26fb1d3486f69d2537a7a293c9f3cb77cb87b94 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 11 Apr 2024 09:30:16 +0000 Subject: [PATCH 4/5] Disable the witness liveness alert in prod for now. --- deployment/live/monitoring/prod/terragrunt.hcl | 1 + deployment/live/monitoring/terragrunt.hcl | 7 ++++--- deployment/modules/monitoring/main.tf | 1 + deployment/modules/monitoring/variables.tf | 5 +++++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/deployment/live/monitoring/prod/terragrunt.hcl b/deployment/live/monitoring/prod/terragrunt.hcl index ab22c8b..8acb465 100644 --- a/deployment/live/monitoring/prod/terragrunt.hcl +++ b/deployment/live/monitoring/prod/terragrunt.hcl @@ -7,6 +7,7 @@ inputs = merge( include.root.locals, { alert_lt_num_witness_threshold = 10 + alert_enable_num_witness = false } ) diff --git a/deployment/live/monitoring/terragrunt.hcl b/deployment/live/monitoring/terragrunt.hcl index bb08359..21a805c 100644 --- a/deployment/live/monitoring/terragrunt.hcl +++ b/deployment/live/monitoring/terragrunt.hcl @@ -3,9 +3,10 @@ terraform { } locals { - project_id = "checkpoint-distributor" - region = "us-central1" - env = path_relative_to_include() + project_id = "checkpoint-distributor" + region = "us-central1" + env = path_relative_to_include() + alert_enable_num_witness = true } remote_state { diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf index 3cd3ff8..8e6b561 100644 --- a/deployment/modules/monitoring/main.tf +++ b/deployment/modules/monitoring/main.tf @@ -97,6 +97,7 @@ resource "google_monitoring_dashboard" "witness_dashboard" { } resource "google_monitoring_alert_policy" "witness_liveness" { + enabled = var.alert_enable_num_witness display_name = "Number of live witnesses (${var.env})" combiner = "OR" conditions { diff --git a/deployment/modules/monitoring/variables.tf b/deployment/modules/monitoring/variables.tf index c60bf96..2836bef 100644 --- a/deployment/modules/monitoring/variables.tf +++ b/deployment/modules/monitoring/variables.tf @@ -33,3 +33,8 @@ variable "alert_lt_num_witness_threshold" { description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric." type = number } + +variable "alert_enable_num_witness" { + description = "Whether to enable alert_lt_num_witness_threshold." + type = bool +} \ No newline at end of file From 199f3b60205c26f0a4445e0be4f5db4c52ed8853 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 11 Apr 2024 09:42:57 +0000 Subject: [PATCH 5/5] Align whitespace. --- deployment/modules/monitoring/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf index 8e6b561..e56123e 100644 --- a/deployment/modules/monitoring/main.tf +++ b/deployment/modules/monitoring/main.tf @@ -97,7 +97,7 @@ resource "google_monitoring_dashboard" "witness_dashboard" { } resource "google_monitoring_alert_policy" "witness_liveness" { - enabled = var.alert_enable_num_witness + enabled = var.alert_enable_num_witness display_name = "Number of live witnesses (${var.env})" combiner = "OR" conditions {