Skip to content

Commit

Permalink
Added basic alerts for receiving updated checkpoints (#112)
Browse files Browse the repository at this point in the history
This adds two alerts for each environment:
 1. that there is a non-zero metric available for update requests
 2. that at least half of updates succeed

These alerts are very rudimentary, but at least set up some basic alerting that we can learn from, and add to.
  • Loading branch information
mhutchinson authored Feb 27, 2024
1 parent 0a0dbd2 commit 735e91d
Show file tree
Hide file tree
Showing 10 changed files with 281 additions and 0 deletions.
22 changes: 22 additions & 0 deletions deployment/live/monitoring/ci/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions deployment/live/monitoring/ci/terragrunt.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Terragrunt entry point for the "ci" monitoring deployment.

# Pull in the parent terragrunt.hcl found by walking up the directory tree.
include {
  path = find_in_parent_folders()
}

# Re-read the parent config so that its `locals` can be forwarded as inputs.
locals {
  common_vars = read_terragrunt_config(find_in_parent_folders())
}

# The Terraform module that this deployment instantiates.
terraform {
  source = "${get_path_to_repo_root()}/deployment/modules/monitoring"
}

# Parent locals first, then the explicit env so that it wins in the merge.
inputs = merge(
  local.common_vars.locals,
  { env = "ci" },
)

22 changes: 22 additions & 0 deletions deployment/live/monitoring/dev/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions deployment/live/monitoring/dev/terragrunt.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Terragrunt entry point for the "dev" monitoring deployment.

# Pull in the parent terragrunt.hcl found by walking up the directory tree.
include {
  path = find_in_parent_folders()
}

# Re-read the parent config so that its `locals` can be forwarded as inputs.
locals {
  common_vars = read_terragrunt_config(find_in_parent_folders())
}

# The Terraform module that this deployment instantiates.
terraform {
  source = "${get_path_to_repo_root()}/deployment/modules/monitoring"
}

# Parent locals first, then the explicit env so that it wins in the merge.
inputs = merge(
  local.common_vars.locals,
  { env = "dev" },
)

22 changes: 22 additions & 0 deletions deployment/live/monitoring/prod/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions deployment/live/monitoring/prod/terragrunt.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Terragrunt entry point for the "prod" monitoring deployment.

# Pull in the parent terragrunt.hcl found by walking up the directory tree.
include {
  path = find_in_parent_folders()
}

# Re-read the parent config so that its `locals` can be forwarded as inputs.
locals {
  common_vars = read_terragrunt_config(find_in_parent_folders())
}

# The Terraform module that this deployment instantiates.
terraform {
  source = "${get_path_to_repo_root()}/deployment/modules/monitoring"
}

# Parent locals first, then the explicit env so that it wins in the merge.
inputs = merge(
  local.common_vars.locals,
  { env = "prod" },
)

20 changes: 20 additions & 0 deletions deployment/live/monitoring/terragrunt.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Shared settings for every monitoring deployment under this directory.
locals {
  project_id = "checkpoint-distributor"
  region     = "us-central1"

  # Derived from the including config's path relative to this file.
  env = path_relative_to_include()
}

# Terraform state is kept in a GCS bucket whose name embeds the env.
remote_state {
  backend = "gcs"

  config = {
    project  = local.project_id
    location = local.region
    bucket   = "${local.project_id}-monitoring-${local.env}-tfstate"
    prefix   = "${path_relative_to_include()}/terraform.tfstate"

    gcs_bucket_labels = {
      name = "terraform_state_storage"
    }
  }
}
91 changes: 91 additions & 0 deletions deployment/modules/monitoring/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# Google provider, scoped to the project supplied by the caller.
provider "google" {
  project = var.project_id
}

# Lookup of the target project by its ID.
data "google_project" "project" {
  project_id = var.project_id
}

terraform {
  # Provider version is pinned exactly.
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "5.14.0"
    }
  }

  # Left empty on purpose: terragrunt fills in the backend config at deploy time.
  backend "gcs" {}
}

# Enables the Cloud Monitoring API on the provider's project.
resource "google_project_service" "monitoring_api" {
  service = "monitoring.googleapis.com"

  # Do not disable the API when this resource is destroyed.
  disable_on_destroy = false
}

locals {
  # Value matched against the `service_name` label in the alert queries.
  distributor_service = "distributor-service-${var.env}"

  # Range-selector window used inside rate(...) in the PromQL queries.
  # Not to be confused with the alert conditions' own `duration` setting.
  duration = "5m"
}

# Alert policy: the distributor should be receiving update-checkpoint requests.
#
# The query returns a result when the request metric is absent for this
# service, or when its rate over the `local.duration` window is exactly zero.
resource "google_monitoring_alert_policy" "receiving_updates" {
  display_name = "Receiving Updates (${var.env})"
  combiner     = "OR"

  conditions {
    display_name = "Requests are present (${var.env})"

    condition_prometheus_query_language {
      # Note: `==` binds tighter than `OR`, so this is
      # absent(...) OR (rate(...) == 0).
      query = <<-EOT
        sum(
        absent(distributor_update_checkpoint_request{service_name="${local.distributor_service}"})
        OR
        rate(distributor_update_checkpoint_request{service_name="${local.distributor_service}"}[${local.duration}]) == 0
        )
      EOT

      # The query must keep returning a result for 30 minutes before firing.
      duration            = "1800s"
      evaluation_interval = "60s"
    }
  }

  alert_strategy {
    auto_close = "1800s"
  }

  # Alert policies can only be created once the Monitoring API is enabled;
  # without this, a first apply on a fresh project may race the API enablement.
  depends_on = [google_project_service.monitoring_api]
}

# Alert policy: at least half of update-checkpoint requests should succeed.
#
# The ratio is computed as total successes over total requests. Aggregating
# each side *before* dividing matters: summing per-series ratios (as the query
# previously did) can exceed the 0.5 threshold when several series exist even
# though every one of them is failing, and series whose label sets differ
# between the two metrics would be silently dropped by vector matching.
resource "google_monitoring_alert_policy" "successful_updates" {
  display_name = "Successful Updates (${var.env})"
  combiner     = "OR"

  conditions {
    display_name = "Success ratio is healthy (${var.env})"

    condition_prometheus_query_language {
      # With no requests at all the division yields NaN, which does not
      # satisfy `< 0.5`, so this condition stays quiet in that case.
      query = <<-EOT
        (
          sum(rate(distributor_update_checkpoint_success{service_name="${local.distributor_service}"}[${local.duration}]))
          /
          sum(rate(distributor_update_checkpoint_request{service_name="${local.distributor_service}"}[${local.duration}]))
        ) < 0.5
      EOT

      # The ratio must stay below the threshold for 30 minutes before firing.
      duration            = "1800s"
      evaluation_interval = "60s"
    }
  }

  alert_strategy {
    auto_close = "1800s"
  }

  # Alert policies can only be created once the Monitoring API is enabled;
  # without this, a first apply on a fresh project may race the API enablement.
  depends_on = [google_project_service.monitoring_api]
}
16 changes: 16 additions & 0 deletions deployment/modules/monitoring/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

31 changes: 31 additions & 0 deletions deployment/modules/monitoring/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# Input variables for the monitoring module. The descriptions previously
# referred to "the cluster", but this module only creates monitoring resources.

variable "project_id" {
  description = "The ID of the GCP project in which the monitoring resources are created"
  type        = string
}

# NOTE(review): not referenced by the module's main.tf as far as visible here;
# it appears to be declared because the shared terragrunt locals pass it in.
variable "region" {
  description = "The GCP region for the deployment, as supplied by the shared terragrunt configuration"
  type        = string
}

variable "env" {
  description = "Unique identifier for the env, e.g. ci or prod"
  type        = string
}

0 comments on commit 735e91d

Please sign in to comment.