Skip to content

Commit

Permalink
Define liveness alert policy and add its graph to the dashboard. (#151)
Browse files Browse the repository at this point in the history
* Define liveness alert policy and add its graph to the dashboard.

* Add comment.

* Rename alert threshold var.

* Disable the witness liveness alert in prod for now.

* Align whitespace.
  • Loading branch information
jiggoha authored Apr 11, 2024
1 parent 2cd0647 commit f87812a
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 3 deletions.
1 change: 1 addition & 0 deletions deployment/live/monitoring/ci/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ include "root" {
# Inputs for the CI monitoring deployment: the shared root locals plus
# CI-specific overrides.
inputs = merge(
include.root.locals,
{
# The witness liveness condition is "< threshold", so 0 presumably keeps the
# alert from ever firing in this environment (counts cannot go below 0).
alert_lt_num_witness_threshold = 0
}
)

1 change: 1 addition & 0 deletions deployment/live/monitoring/dev/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ include "root" {
# Inputs for the dev monitoring deployment: the shared root locals plus
# dev-specific overrides.
inputs = merge(
include.root.locals,
{
# The witness liveness condition is "< threshold", so 0 presumably keeps the
# alert from ever firing in this environment (counts cannot go below 0).
alert_lt_num_witness_threshold = 0
}
)

2 changes: 2 additions & 0 deletions deployment/live/monitoring/prod/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ include "root" {
# Inputs for the prod monitoring deployment: the shared root locals plus
# prod-specific overrides. Keys in the second map win over the root locals.
inputs = merge(
include.root.locals,
{
# Lower bound passed to the witness liveness alert condition.
alert_lt_num_witness_threshold = 10
# Overrides the root-level value so the liveness alert policy is created but
# left disabled in prod for now.
alert_enable_num_witness = false
}
)

7 changes: 4 additions & 3 deletions deployment/live/monitoring/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ terraform {
}

locals {
project_id = "checkpoint-distributor"
region = "us-central1"
env = path_relative_to_include()
project_id = "checkpoint-distributor"
region = "us-central1"
env = path_relative_to_include()
alert_enable_num_witness = true
}

remote_state {
Expand Down
33 changes: 33 additions & 0 deletions deployment/modules/monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,46 @@ resource "google_monitoring_dashboard" "witness_dashboard" {
"scale": "LINEAR"
}
}
},
{
"title": "Witness liveness alert chart",
"alertChart": {
"name": "${google_monitoring_alert_policy.witness_liveness.name}"
}
}
]
}
}
EOF
}

# Alert policy on the number of witnesses reporting the
# distributor_update_checkpoint_success metric in this environment. The
# condition must hold for 30 minutes before the alert fires. The policy can be
# switched off per environment via var.alert_enable_num_witness.
resource "google_monitoring_alert_policy" "witness_liveness" {
  enabled      = var.alert_enable_num_witness
  display_name = "Number of live witnesses (${var.env})"
  combiner     = "OR"
  conditions {
    display_name = "Number of live witnesses < ${var.alert_lt_num_witness_threshold}"
    condition_monitoring_query_language {
      # First group by both witness_id and instanceId, since the metric for
      # each witness is reported by multiple instances. Then, since the
      # timeseries across instances overlap, collapse them to a single row per
      # witness so that a witness is not double-counted across instances.
      #
      # Finally, drop the witness_id label and count the remaining rows. This
      # yields ONE time series whose value is the number of witnesses, which is
      # what the threshold is compared against. Without this last step the
      # comparison would be applied to every per-witness series individually.
      #
      # NOTE(review): when no witness reports at all, the query presumably
      # produces no data instead of 0, so this condition alone may not fire on
      # a total outage - confirm, and consider an absence condition if needed.
      query = <<-EOT
        fetch prometheus_target
        | metric
            'prometheus.googleapis.com/distributor_update_checkpoint_success/counter'
        | filter (resource.namespace == 'distributor-service-${var.env}')
        | align rate(1m)
        | every 1m
        | group_by [metric.witness_id, metric.instanceId], [row_count: row_count()]
        | group_by [metric.witness_id], [row_count_mean: mean(row_count)]
        | group_by [], [num_live_witnesses: row_count()]
        | lt(${var.alert_lt_num_witness_threshold})
      EOT
      # How long the condition has to stay true before an incident is opened.
      duration = "1800s"
    }
  }
}

resource "google_monitoring_alert_policy" "receiving_updates" {
display_name = "Receiving Updates (${var.env})"
combiner = "OR"
Expand Down
9 changes: 9 additions & 0 deletions deployment/modules/monitoring/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,12 @@ variable "env" {
type = string
}

# Threshold used by the witness liveness alert condition; each environment's
# terragrunt.hcl supplies its own value.
variable "alert_lt_num_witness_threshold" {
  type        = number
  description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric."
}

# Toggle for the witness liveness alert policy. It feeds the policy's
# `enabled` argument, so the policy still exists when this is false.
variable "alert_enable_num_witness" {
  description = "Whether the witness liveness alert policy (number of live witnesses below alert_lt_num_witness_threshold) is enabled."
  type        = bool
}

0 comments on commit f87812a

Please sign in to comment.