Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: critical alerts by modules - 4 #269

Merged
merged 4 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .env.example.compose
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,3 @@ VALIDATOR_REGISTRY_SOURCE=lido
# Critical alerts (optional).
# CRITICAL_ALERTS_ALERTMANAGER_URL=http://alertmanager:9093
# CRITICAL_ALERTS_MIN_VAL_COUNT=1

# Discord web-hook (optional).
# DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/...
3 changes: 0 additions & 3 deletions .env.example.local
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,3 @@ VALIDATOR_REGISTRY_SOURCE=lido
# Critical alerts (optional).
# CRITICAL_ALERTS_ALERTMANAGER_URL=http://alertmanager:9093
# CRITICAL_ALERTS_MIN_VAL_COUNT=1

# Discord web-hook (optional).
# DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/...
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ services:
deploy:
resources:
limits:
memory: 256m
memory: 512m
volumes:
- ./.volumes/prometheus/:/prometheus
- ./docker/prometheus/:/etc/prometheus/
Expand All @@ -75,7 +75,7 @@ services:
- '8083:8080'

alertmanager:
image: prom/alertmanager:latest
image: prom/alertmanager:v0.24.0
vgorkavenko marked this conversation as resolved.
Show resolved Hide resolved
container_name: alertmanager
restart: unless-stopped
deploy:
Expand Down
24 changes: 12 additions & 12 deletions docker/prometheus/alerts_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@ groups:
annotations:
emoji: 🔪
summary: "Operators have slashed validators"
description: 'Number of slashed validators per operator'
description: 'Number of slashed validators per operator.'
field_name: '{{ $labels.nos_name }}'
field_value: '[{{ $value | printf "%.0f" }}](http://127.0.0.1:8082/d/3wimU2H7h/nodeoperators/?var-nos_name_var={{ urlquery $labels.nos_name }}&from={{ with query "(time() - 1200) * 1000" }}{{ . | first | value | printf "%f" }}{{ end }}&to={{ with query "time() * 1000" }}{{ . | first | value | printf "%f" }}{{ end }})'
url: "http://127.0.0.1:8082/d/HRgPmpNnz/validators"
footer_text: 'Epoch • {{ with query "ethereum_validators_monitoring_epoch_number" }}{{ . | first | value | printf "%.0f" }}{{ end }}'
footer_icon_url: "https://cryptologos.cc/logos/steth-steth-logo.png"

- alert: DataActuality
expr: absent(ethereum_validators_monitoring_data_actuality) OR (ethereum_validators_monitoring_data_actuality / 1000 > 3600)
expr: ethereum_validators_monitoring_data_actuality > 3600000 OR absent(ethereum_validators_monitoring_data_actuality)
for: 1m
labels:
severity: critical
annotations:
emoji: ⏳
summary: "Data actuality greater then 1 hour"
resolved_summary: "Data actuality is back to normal and now less then 1 hour"
description: "({{ humanizeDuration $value }}) It's not OK. Please, check app health"
resolved_description: "It's OK"
resolved_summary: "Data actuality is back to normal and now less then 1 hour."
description: "({{ humanizeDuration $value }}) It's not OK. Please, check app health."
resolved_description: "It's OK."
url: "http://127.0.0.1:8082/d/HRgPmpNnz/validators"
footer_text: 'Epoch • {{ with query "ethereum_validators_monitoring_epoch_number" }}{{ . | first | value | printf "%.0f" }}{{ end }}'
footer_icon_url: "https://cryptologos.cc/logos/steth-steth-logo.png"
Expand All @@ -38,7 +38,7 @@ groups:
annotations:
emoji: 💸
summary: 'Operators have a negative balance delta'
resolved_summary: 'Operators have a positive balance delta'
resolved_summary: 'Operators have a positive balance delta.'
description: 'Number of validators per operator who have a negative balance delta.'
resolved_description: 'Number of validators per operator who recovered.'
field_name: '{{ $labels.nos_name }}'
Expand All @@ -54,7 +54,7 @@ groups:
annotations:
emoji: 📝❌
summary: 'Operators have missed attestation in last {{ $labels.epoch_interval }} finalized epochs'
resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs'
resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs.'
description: 'Number of validators per operator who have missed attestations.'
resolved_description: 'Number of validators per operator who recovered.'
field_name: '{{ $labels.nos_name }}'
Expand Down Expand Up @@ -98,7 +98,7 @@ groups:
annotations:
emoji: 📥
summary: 'Operators missed block propose in the last finalized epoch'
resolved_summary: 'Operators not missed block propose in the last finalized epoch'
resolved_summary: 'Operators not missed block propose in the last finalized epoch.'
description: 'Number of validators per operator who missed block propose.'
resolved_description: 'Number of validators per operator who recovered.'
field_name: '{{ $labels.nos_name }}'
Expand All @@ -114,7 +114,7 @@ groups:
annotations:
emoji: 🔄
summary: 'Operators sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs'
resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epochs'
resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epochs.'
description: 'Number of validators per operator whose sync participation less than average.'
resolved_description: 'Number of validators per operator who recovered.'
field_name: '{{ $labels.nos_name }}'
Expand All @@ -129,7 +129,7 @@ groups:
severity: critical
annotations:
emoji: '📈🔄'
summary: 'Operators may get high rewards in the future, but sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs!'
summary: 'Operators may get high rewards in the future, but sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs'
resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epoch. Now may get high rewards in the future!'
description: 'Number of validators per operator whose sync participation less than average.'
resolved_description: 'Number of validators per operator who recovered.'
Expand All @@ -145,7 +145,7 @@ groups:
severity: critical
annotations:
emoji: '📈📝❌'
summary: 'Operators may get high rewards in the future, but missed attestation in last {{ $labels.epoch_interval }} finalized epochs!'
summary: 'Operators may get high rewards in the future, but missed attestation in last {{ $labels.epoch_interval }} finalized epochs'
resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs. Now may get high rewards in the future!'
description: 'Number of validators per operator who have missed attestations.'
resolved_description: 'Number of validators per operator who recovered.'
Expand All @@ -161,7 +161,7 @@ groups:
severity: critical
annotations:
emoji: '📈📥'
summary: 'Operators may get high rewards in the future, but missed block propose in the last finalized epoch!'
summary: 'Operators may get high rewards in the future, but missed block propose in the last finalized epoch'
resolved_summary: 'Operators not missed block propose in the last finalized epoch. Now may get high rewards in the future!'
description: 'Number of validators per operator who missed block propose.'
resolved_description: 'Number of validators per operator who recovered.'
Expand Down
2 changes: 1 addition & 1 deletion src/common/alertmanager/alerts/CriticalMissedAttestations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export class CriticalMissedAttestations extends Alert {
labels: {
alertname: this.alertname,
severity: 'critical',
nos_module_id: this.moduleIndex,
nos_module_id: this.moduleIndex.toString(),
vgorkavenko marked this conversation as resolved.
Show resolved Hide resolved
...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'),
},
annotations: {
Expand Down
2 changes: 1 addition & 1 deletion src/common/alertmanager/alerts/CriticalMissedProposes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ export class CriticalMissedProposes extends Alert {
labels: {
alertname: this.alertname,
severity: 'critical',
nos_module_id: this.moduleIndex,
nos_module_id: this.moduleIndex.toString(),
...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'),
},
annotations: {
Expand Down
2 changes: 1 addition & 1 deletion src/common/alertmanager/alerts/CriticalNegativeDelta.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export class CriticalNegativeDelta extends Alert {
labels: {
alertname: this.alertname,
severity: 'critical',
nos_module_id: this.moduleIndex,
nos_module_id: this.moduleIndex.toString(),
...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'),
},
annotations: {
Expand Down
2 changes: 1 addition & 1 deletion src/common/alertmanager/alerts/CriticalSlashing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ export class CriticalSlashing extends Alert {
labels: {
alertname: this.alertname,
severity: 'critical',
nos_module_id: this.moduleIndex,
nos_module_id: this.moduleIndex.toString(),
...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'),
},
annotations: {
Expand Down