diff --git a/deployment/live/monitoring/ci/terragrunt.hcl b/deployment/live/monitoring/ci/terragrunt.hcl
index 94ba5aa..5d0260e 100644
--- a/deployment/live/monitoring/ci/terragrunt.hcl
+++ b/deployment/live/monitoring/ci/terragrunt.hcl
@@ -6,6 +6,7 @@ include "root" {
 inputs = merge(
   include.root.locals,
   {
+    alert_lt_num_witness_threshold = 0
   }
 )
 
diff --git a/deployment/live/monitoring/dev/terragrunt.hcl b/deployment/live/monitoring/dev/terragrunt.hcl
index 94ba5aa..5d0260e 100644
--- a/deployment/live/monitoring/dev/terragrunt.hcl
+++ b/deployment/live/monitoring/dev/terragrunt.hcl
@@ -6,6 +6,7 @@ include "root" {
 inputs = merge(
   include.root.locals,
   {
+    alert_lt_num_witness_threshold = 0
   }
 )
 
diff --git a/deployment/live/monitoring/prod/terragrunt.hcl b/deployment/live/monitoring/prod/terragrunt.hcl
index 94ba5aa..8acb465 100644
--- a/deployment/live/monitoring/prod/terragrunt.hcl
+++ b/deployment/live/monitoring/prod/terragrunt.hcl
@@ -6,6 +6,8 @@ include "root" {
 inputs = merge(
   include.root.locals,
   {
+    alert_lt_num_witness_threshold = 10
+    alert_enable_num_witness       = false
   }
 )
 
diff --git a/deployment/live/monitoring/terragrunt.hcl b/deployment/live/monitoring/terragrunt.hcl
index bb08359..21a805c 100644
--- a/deployment/live/monitoring/terragrunt.hcl
+++ b/deployment/live/monitoring/terragrunt.hcl
@@ -3,9 +3,10 @@ terraform {
 }
 
 locals {
-  project_id = "checkpoint-distributor"
-  region     = "us-central1"
-  env        = path_relative_to_include()
+  project_id               = "checkpoint-distributor"
+  region                   = "us-central1"
+  env                      = path_relative_to_include()
+  alert_enable_num_witness = true
 }
 
 remote_state {
diff --git a/deployment/modules/monitoring/main.tf b/deployment/modules/monitoring/main.tf
index c6f4da3..e56123e 100644
--- a/deployment/modules/monitoring/main.tf
+++ b/deployment/modules/monitoring/main.tf
@@ -83,6 +83,12 @@ resource "google_monitoring_dashboard" "witness_dashboard" {
             "scale": "LINEAR"
           }
         }
+      },
+      {
+        "title": "Witness liveness alert chart",
+        "alertChart": {
+          "name": "${google_monitoring_alert_policy.witness_liveness.name}"
+        }
       }
     ]
   }
@@ -90,6 +96,36 @@ resource "google_monitoring_dashboard" "witness_dashboard" {
 EOF
 }
 
+resource "google_monitoring_alert_policy" "witness_liveness" {
+  enabled      = var.alert_enable_num_witness
+  display_name = "Number of live witnesses (${var.env})"
+  combiner     = "OR"
+  conditions {
+    display_name = "Number of live witnesses < ${var.alert_lt_num_witness_threshold}"
+    condition_monitoring_query_language {
+      # First group by both witness_id and instanceId, since the metric for
+      # each witness is reported by multiple instances. Then, since the
+      # timeseries across instances overlap, take the average. This ensures
+      # that the count for each witness is not double-counted across instances.
+      # NOTE(review): the last group_by keys on metric.witness_id, so the lt()
+      # comparison below appears to apply per witness rather than to a total
+      # witness count -- confirm this matches the condition's display name.
+      query = <<-EOT
+        fetch prometheus_target
+        | metric
+            'prometheus.googleapis.com/distributor_update_checkpoint_success/counter'
+        | filter (resource.namespace == 'distributor-service-${var.env}')
+        | align rate(1m)
+        | every 1m
+        | group_by [metric.witness_id, metric.instanceId], [row_count: row_count()]
+        | group_by [metric.witness_id], [row_count_mean: mean(row_count)]
+        | lt(${var.alert_lt_num_witness_threshold})
+      EOT
+      duration = "1800s"
+    }
+  }
+}
+
 resource "google_monitoring_alert_policy" "receiving_updates" {
   display_name = "Receiving Updates (${var.env})"
   combiner     = "OR"
diff --git a/deployment/modules/monitoring/variables.tf b/deployment/modules/monitoring/variables.tf
index 0b9a24f..2836bef 100644
--- a/deployment/modules/monitoring/variables.tf
+++ b/deployment/modules/monitoring/variables.tf
@@ -29,3 +29,12 @@ variable "env" {
   type = string
 }
 
+variable "alert_lt_num_witness_threshold" {
+  description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric."
+  type        = number
+}
+
+variable "alert_enable_num_witness" {
+  description = "Whether the witness liveness alert policy (google_monitoring_alert_policy.witness_liveness) is enabled."
+  type        = bool
+}