Skip to content

Commit

Permalink
Fix final sum bug in live witnesses alert. (#153)
Browse files Browse the repository at this point in the history
  • Loading branch information
jiggoha authored Apr 15, 2024
1 parent a360cfd commit dcfa03b
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion deployment/modules/monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ resource "google_monitoring_dashboard" "witness_dashboard" {

resource "google_monitoring_alert_policy" "witness_liveness" {
enabled = var.alert_enable_num_witness
- display_name = "Number of live witnesses (${var.env})"
+ display_name = "Number of live witnesses (${var.env}) < ${var.alert_lt_num_witness_threshold}"
combiner = "OR"
conditions {
display_name = "Number of live witnesses < ${var.alert_lt_num_witness_threshold}"
Expand All @@ -107,6 +107,7 @@ resource "google_monitoring_alert_policy" "witness_liveness" {
# each witness is reported by multiple instances. Then, since the
# timeseries across instances overlap, take the average. This ensures
# that the count for each witness is not double-counted across instances.
# Finally, add all the counts together to compare against the threshold.
query = <<-EOT
fetch prometheus_target
| metric
Expand All @@ -116,6 +117,7 @@ resource "google_monitoring_alert_policy" "witness_liveness" {
| every 1m
| group_by [metric.witness_id, metric.instanceId], [row_count: row_count()]
| group_by [metric.witness_id], [row_count_mean: mean(row_count)]
| group_by [], [value_witness_aggregate: aggregate(row_count_mean)]
| lt(${var.alert_lt_num_witness_threshold})
EOT
duration = "1800s"
Expand Down

0 comments on commit dcfa03b

Please sign in to comment.