From c5253ab145c0dc5a845e33f4611788b142afec80 Mon Sep 17 00:00:00 2001 From: Alexander Lukin Date: Thu, 23 Jan 2025 01:48:23 +0400 Subject: [PATCH 1/3] fix: convert numeric labels to strings 1. Alertmanager failed to handle the numeric `nos_module_id` label correctly. Because of this fact, Discord alerts with this label were not sent and the app returned the Alertmanager error. Now this bug is fixed. 2. The app failed to work correctly with the latest version of the Alertmanager. Now the particular version of the Alertmanager is fixed in the `docker-compose` file so that the app can always start all its containers correctly. 3. Increase memory required for the Prometheus service in the docker-compose, so that the Prometheus service can correctly handle a larger number of node operators. 4. Remove the `DISCORD_WEBHOOK_URL` env variable from env examples as we don't have such a variable in the EVM app. 5. Slightly harmonize the content of alert labels. --- .env.example.compose | 3 --- .env.example.local | 3 --- docker-compose.yml | 4 ++-- docker/prometheus/alerts_rules.yml | 24 +++++++++---------- .../alerts/CriticalMissedAttestations.ts | 2 +- .../alerts/CriticalMissedProposes.ts | 2 +- .../alerts/CriticalNegativeDelta.ts | 2 +- .../alertmanager/alerts/CriticalSlashing.ts | 2 +- 8 files changed, 18 insertions(+), 24 deletions(-) diff --git a/.env.example.compose b/.env.example.compose index a7f7417c..adc95b69 100644 --- a/.env.example.compose +++ b/.env.example.compose @@ -27,6 +27,3 @@ VALIDATOR_REGISTRY_SOURCE=lido # Critical alerts (optional). # CRITICAL_ALERTS_ALERTMANAGER_URL=http://alertmanager:9093 # CRITICAL_ALERTS_MIN_VAL_COUNT=1 - -# Discord web-hook (optional). -# DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/... diff --git a/.env.example.local b/.env.example.local index b5666484..7bafb0be 100644 --- a/.env.example.local +++ b/.env.example.local @@ -32,6 +32,3 @@ VALIDATOR_REGISTRY_SOURCE=lido # Critical alerts (optional). 
# CRITICAL_ALERTS_ALERTMANAGER_URL=http://alertmanager:9093 # CRITICAL_ALERTS_MIN_VAL_COUNT=1 - -# Discord web-hook (optional). -# DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/... diff --git a/docker-compose.yml b/docker-compose.yml index e8574bbf..df4ee3d4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -54,7 +54,7 @@ services: deploy: resources: limits: - memory: 256m + memory: 512m volumes: - ./.volumes/prometheus/:/prometheus - ./docker/prometheus/:/etc/prometheus/ @@ -75,7 +75,7 @@ services: - '8083:8080' alertmanager: - image: prom/alertmanager:latest + image: prom/alertmanager:v0.24.0 container_name: alertmanager restart: unless-stopped deploy: diff --git a/docker/prometheus/alerts_rules.yml b/docker/prometheus/alerts_rules.yml index 908eebcc..e7b257d3 100644 --- a/docker/prometheus/alerts_rules.yml +++ b/docker/prometheus/alerts_rules.yml @@ -9,7 +9,7 @@ groups: annotations: emoji: 🔪 summary: "Operators have slashed validators" - description: 'Number of slashed validators per operator' + description: 'Number of slashed validators per operator.' field_name: '{{ $labels.nos_name }}' field_value: '[{{ $value | printf "%.0f" }}](http://127.0.0.1:8082/d/3wimU2H7h/nodeoperators/?var-nos_name_var={{ urlquery $labels.nos_name }}&from={{ with query "(time() - 1200) * 1000" }}{{ . | first | value | printf "%f" }}{{ end }}&to={{ with query "time() * 1000" }}{{ . 
| first | value | printf "%f" }}{{ end }})' url: "http://127.0.0.1:8082/d/HRgPmpNnz/validators" @@ -17,16 +17,16 @@ groups: footer_icon_url: "https://cryptologos.cc/logos/steth-steth-logo.png" - alert: DataActuality - expr: absent(ethereum_validators_monitoring_data_actuality) OR (ethereum_validators_monitoring_data_actuality / 1000 > 3600) + expr: ethereum_validators_monitoring_data_actuality > 3600000 OR absent(ethereum_validators_monitoring_data_actuality) for: 1m labels: severity: critical annotations: emoji: ⏳ summary: "Data actuality greater then 1 hour" - resolved_summary: "Data actuality is back to normal and now less then 1 hour" - description: "({{ humanizeDuration $value }}) It's not OK. Please, check app health" - resolved_description: "It's OK" + resolved_summary: "Data actuality is back to normal and now less then 1 hour." + description: "({{ humanizeDuration $value }}) It's not OK. Please, check app health." + resolved_description: "It's OK." url: "http://127.0.0.1:8082/d/HRgPmpNnz/validators" footer_text: 'Epoch • {{ with query "ethereum_validators_monitoring_epoch_number" }}{{ . | first | value | printf "%.0f" }}{{ end }}' footer_icon_url: "https://cryptologos.cc/logos/steth-steth-logo.png" @@ -38,7 +38,7 @@ groups: annotations: emoji: 💸 summary: 'Operators have a negative balance delta' - resolved_summary: 'Operators have a positive balance delta' + resolved_summary: 'Operators have a positive balance delta.' description: 'Number of validators per operator who have a negative balance delta.' resolved_description: 'Number of validators per operator who recovered.' 
field_name: '{{ $labels.nos_name }}' @@ -54,7 +54,7 @@ groups: annotations: emoji: 📝❌ summary: 'Operators have missed attestation in last {{ $labels.epoch_interval }} finalized epochs' - resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs' + resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs.' description: 'Number of validators per operator who have missed attestations.' resolved_description: 'Number of validators per operator who recovered.' field_name: '{{ $labels.nos_name }}' @@ -98,7 +98,7 @@ groups: annotations: emoji: 📥 summary: 'Operators missed block propose in the last finalized epoch' - resolved_summary: 'Operators not missed block propose in the last finalized epoch' + resolved_summary: 'Operators not missed block propose in the last finalized epoch.' description: 'Number of validators per operator who missed block propose.' resolved_description: 'Number of validators per operator who recovered.' field_name: '{{ $labels.nos_name }}' @@ -114,7 +114,7 @@ groups: annotations: emoji: 🔄 summary: 'Operators sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs' - resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epochs' + resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epochs.' description: 'Number of validators per operator whose sync participation less than average.' resolved_description: 'Number of validators per operator who recovered.' field_name: '{{ $labels.nos_name }}' @@ -129,7 +129,7 @@ groups: severity: critical annotations: emoji: '📈🔄' - summary: 'Operators may get high rewards in the future, but sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs!' 
+ summary: 'Operators may get high rewards in the future, but sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs' resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epoch. Now may get high rewards in the future!' description: 'Number of validators per operator whose sync participation less than average.' resolved_description: 'Number of validators per operator who recovered.' @@ -145,7 +145,7 @@ groups: severity: critical annotations: emoji: '📈📝❌' - summary: 'Operators may get high rewards in the future, but missed attestation in last {{ $labels.epoch_interval }} finalized epochs!' + summary: 'Operators may get high rewards in the future, but missed attestation in last {{ $labels.epoch_interval }} finalized epochs' resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs. Now may get high rewards in the future!' description: 'Number of validators per operator who have missed attestations.' resolved_description: 'Number of validators per operator who recovered.' @@ -161,7 +161,7 @@ groups: severity: critical annotations: emoji: '📈📥' - summary: 'Operators may get high rewards in the future, but missed block propose in the last finalized epoch!' + summary: 'Operators may get high rewards in the future, but missed block propose in the last finalized epoch' resolved_summary: 'Operators not missed block propose in the last finalized epoch. Now may get high rewards in the future!' description: 'Number of validators per operator who missed block propose.' resolved_description: 'Number of validators per operator who recovered.' 
diff --git a/src/common/alertmanager/alerts/CriticalMissedAttestations.ts b/src/common/alertmanager/alerts/CriticalMissedAttestations.ts index cf1e2974..d1ee26a3 100644 --- a/src/common/alertmanager/alerts/CriticalMissedAttestations.ts +++ b/src/common/alertmanager/alerts/CriticalMissedAttestations.ts @@ -85,7 +85,7 @@ export class CriticalMissedAttestations extends Alert { labels: { alertname: this.alertname, severity: 'critical', - nos_module_id: this.moduleIndex, + nos_module_id: this.moduleIndex.toString(), ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), }, annotations: { diff --git a/src/common/alertmanager/alerts/CriticalMissedProposes.ts b/src/common/alertmanager/alerts/CriticalMissedProposes.ts index 8669a716..e92cf4ca 100644 --- a/src/common/alertmanager/alerts/CriticalMissedProposes.ts +++ b/src/common/alertmanager/alerts/CriticalMissedProposes.ts @@ -78,7 +78,7 @@ export class CriticalMissedProposes extends Alert { labels: { alertname: this.alertname, severity: 'critical', - nos_module_id: this.moduleIndex, + nos_module_id: this.moduleIndex.toString(), ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), }, annotations: { diff --git a/src/common/alertmanager/alerts/CriticalNegativeDelta.ts b/src/common/alertmanager/alerts/CriticalNegativeDelta.ts index fb371d21..35c8da25 100644 --- a/src/common/alertmanager/alerts/CriticalNegativeDelta.ts +++ b/src/common/alertmanager/alerts/CriticalNegativeDelta.ts @@ -85,7 +85,7 @@ export class CriticalNegativeDelta extends Alert { labels: { alertname: this.alertname, severity: 'critical', - nos_module_id: this.moduleIndex, + nos_module_id: this.moduleIndex.toString(), ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), }, annotations: { diff --git a/src/common/alertmanager/alerts/CriticalSlashing.ts b/src/common/alertmanager/alerts/CriticalSlashing.ts index d9dd0aef..006e1687 100644 --- a/src/common/alertmanager/alerts/CriticalSlashing.ts +++ b/src/common/alertmanager/alerts/CriticalSlashing.ts 
@@ -56,7 +56,7 @@ export class CriticalSlashing extends Alert { labels: { alertname: this.alertname, severity: 'critical', - nos_module_id: this.moduleIndex, + nos_module_id: this.moduleIndex.toString(), ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), }, annotations: { From 6918bc4babd9c4cdf1027a2d529b2d16ab659043 Mon Sep 17 00:00:00 2001 From: Alexander Lukin Date: Thu, 23 Jan 2025 02:18:19 +0400 Subject: [PATCH 2/3] Reapply "feat: critical alerts by modules - 2" This reverts commit 3e718674a7fe4c56c9d2fc42dd41149270c3a3d5. --- README.md | 184 ++++++++++++++++-- docker/validators/custom_mainnet.yaml | 27 ++- docker/validators/custom_testnet.yaml | 27 ++- src/common/alertmanager/alerts/BasicAlert.ts | 23 ++- .../alerts/CriticalMissedAttestations.ts | 64 ++++-- .../alerts/CriticalMissedProposes.ts | 60 ++++-- .../alerts/CriticalNegativeDelta.ts | 72 +++++-- .../alertmanager/alerts/CriticalSlashing.ts | 49 +++-- .../alertmanager/critical-alerts.service.ts | 56 ++++-- src/common/config/config.service.ts | 40 ++++ src/common/config/env.validation.ts | 9 + .../interfaces/environment.interface.ts | 9 + .../consensus-provider.service.ts | 6 +- src/common/functions/urljoin.ts | 6 +- src/inspector/inspector.service.ts | 4 +- .../file-source/file-source.service.ts | 29 ++- .../keysapi-source/keysapi-source.service.ts | 4 + .../lido-source/lido-source.service.ts | 4 + .../registry-source.interface.ts | 1 + src/validators-registry/registry.service.ts | 6 + 20 files changed, 550 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index 15b41bcc..0b2bbf38 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,8 @@ Holesky) this value should be omitted. * **Default:** ./docker/validators/lido_mainnet.db * **Note:** it makes sense to change default value if `VALIDATOR_REGISTRY_SOURCE` is set to "lido" --- -`VALIDATOR_REGISTRY_KEYSAPI_SOURCE_URLS` - Comma-separated list of URLs to [Lido Keys API service](https://github.com/lidofinance/lido-keys-api). 
+`VALIDATOR_REGISTRY_KEYSAPI_SOURCE_URLS` - Comma-separated list of URLs to +[Lido Keys API service](https://github.com/lidofinance/lido-keys-api). * **Required:** false * **Note:** will be used only if `VALIDATOR_REGISTRY_SOURCE` is set to "keysapi" --- @@ -278,21 +279,25 @@ Holesky) this value should be omitted. * **Required:** false * **Default:** 2 --- -`VALIDATOR_USE_STUCK_KEYS_FILE` - Use a file with list of validators that are stuck and should be excluded from the monitoring metrics. +`VALIDATOR_USE_STUCK_KEYS_FILE` - Use a file with list of validators that are stuck and should be excluded from the +monitoring metrics. * **Required:** false * **Values:** true / false * **Default:** false --- -`VALIDATOR_STUCK_KEYS_FILE_PATH` - Path to file with list of validators that are stuck and should be excluded from the monitoring metrics. +`VALIDATOR_STUCK_KEYS_FILE_PATH` - Path to file with list of validators that are stuck and should be excluded from the +monitoring metrics. * **Required:** false * **Default:** ./docker/validators/stuck_keys.yaml * **Note:** will be used only if `VALIDATOR_USE_STUCK_KEYS_FILE` is true --- -`SYNC_PARTICIPATION_DISTANCE_DOWN_FROM_CHAIN_AVG` - Distance (down) from Blockchain Sync Participation average after which we think that our sync participation is bad. +`SYNC_PARTICIPATION_DISTANCE_DOWN_FROM_CHAIN_AVG` - Distance (down) from Blockchain Sync Participation average after +which we think that our sync participation is bad. * **Required:** false * **Default:** 0 --- -`SYNC_PARTICIPATION_EPOCHS_LESS_THAN_CHAIN_AVG` - Number epochs after which we think that our sync participation is bad and alert about that. +`SYNC_PARTICIPATION_EPOCHS_LESS_THAN_CHAIN_AVG` - Number epochs after which we think that our sync participation is bad +and alert about that. * **Required:** false * **Default:** 3 --- @@ -300,33 +305,180 @@ Holesky) this value should be omitted. 
* **Required:** false * **Default:** 3 --- -`CRITICAL_ALERTS_ALERTMANAGER_URL` - If passed, application sends additional critical alerts about validators performance to Alertmanager. +`CRITICAL_ALERTS_ALERTMANAGER_URL` - If passed, application sends additional critical alerts about validators +performance to Alertmanager. * **Required:** false --- -`CRITICAL_ALERTS_MIN_VAL_COUNT` - Critical alerts will be sent for Node Operators with validators count greater this value. +`CRITICAL_ALERTS_MIN_VAL_COUNT` - Critical alerts will be sent for Node Operators with a validator count greater than or +equal to this value. * **Required:** false * **Default:** 100 --- +`CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` - Sets the minimum conditions for triggering critical alerts based on the number +of active validators for node operators in a specific module. + +The value must be in JSON format. Example: +`{ "0": { "minActiveCount": 100, "affectedShare": 0.33, "minAffectedCount": 1000 } }`. + +The numeric key represents the module ID. Settings under the `0` key apply to all modules unless overridden by settings +for specific module IDs. Settings for specific module IDs take precedence over the `0` key. + +A critical alert is sent if: + +* The number of active validators for a node operator meets or exceeds `minActiveCount`. +* The number of affected validators: + * Is at least `affectedShare` of the total active validators for the node operator, OR + * Is greater than or equal to `minAffectedCount`. +* The `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` value for the specific module is not overridden by + `CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`. + +If no settings are provided for a specific module or the `0` key, default values are used: +`{ "minActiveCount": CRITICAL_ALERTS_MIN_VAL_COUNT, "affectedShare": 0.33, "minAffectedCount": 1000 }`. 
+* **Required:** false +* **Default:** {} +--- +`CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT` - Defines the minimum number of affected validators for a node operator in a +specific module for which a critical alert should be sent. + +The value must be in JSON format, for example: `{ "0": 100, "3": 50 }`. The numeric key represents the module ID. The +value for the key `0` applies to all modules. Values for non-zero keys apply only to the specified module and take +precedence over the `0` key. + +This variable takes priority over `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` and `CRITICAL_ALERTS_MIN_VAL_COUNT`. If no +value is set for a specific module or the `0` key, the rules from the other two variables will apply instead. +* **Required:** false +* **Default:** {} +--- `CRITICAL_ALERTS_ALERTMANAGER_LABELS` - Additional labels for critical alerts. -Must be in JSON string format. Example - '{"a":"valueA","b":"valueB"}'. +Must be in JSON string format. Example: `{ "a": "valueA", "b": "valueB" }`. * **Required:** false * **Default:** {} --- ## Application critical alerts (via Alertmanager) -In addition to alerts based on Prometheus metrics you can receive special critical alerts based on beaconchain aggregates from app. +In addition to alerts based on Prometheus metrics you can receive special critical alerts based on Beacon Chain +aggregates from app. You should pass env var `CRITICAL_ALERTS_ALERTMANAGER_URL=http://:`. 
-And if `ethereum_validators_monitoring_data_actuality < 1h` it allows you to receive alerts from table bellow +Critical alerts for modules are controlled by three environment variables, listed here with their priority (from lowest +to highest): +``` +CRITICAL_ALERTS_MIN_VAL_COUNT: number; +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT: { + : { + minActiveCount: number, + affectedShare: number, + minAffectedCount: number, + } +}; +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT: { + : number +}; +``` + +The following rules are applied (listed in order of increasing priority, the next rule overrides the previous one). -| Alert name | Description | If fired repeat | If value increased repeat | -|----------------------------|-----------------------------------------------------------------------------------------------------------------|-----------------|---------------------------| -| CriticalSlashing | At least one validator was slashed | instant | - | -| CriticalMissedProposes | More than 1/3 blocks from Node Operator duties was missed in the last 12 hours | every 6h | - | -| CriticalNegativeDelta | More than 1/3 or more than 1000 Node Operator validators with negative balance delta (between current and 6 epochs ago) | every 6h | every 1h | -| CriticalMissedAttestations | More than 1/3 or more than 1000 Node Operator validators with missed attestations in the last {{ BAD_ATTESTATION_EPOCHS }} epochs | every 6h | every 1h | +1. **Global Fallback** (`CRITICAL_ALERTS_MIN_VAL_COUNT`). If this variable is set, it acts as a default for modules by + creating an implicit rule: +``` +{ + "0": { + "minActiveCount": CRITICAL_ALERTS_MIN_VAL_COUNT, + "affectedShare": 0.33, + "minAffectedCount": 1000 + } +} +``` + +2. **Global Rules for Active Validators** (`CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT`). Default rules apply to all modules + (key `0`) unless overridden. 
+``` +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = { + "0": { + "minActiveCount": <integer>, + "affectedShare": <0.xx>, + "minAffectedCount": <integer>, + } +} +``` +A critical alert is triggered for a module if **both** conditions are met: +* The number of active validators is greater than or equal to `minActiveCount`. +* The number of affected validators is greater than or equal to either `minAffectedCount` or `affectedShare` of the total active validators. + +3. **Global Rules for Affected Validators** (`CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`). Default rules apply to all + modules (key `0`) unless overridden. +``` +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = { + "0": <integer> +} +``` +A critical alert is triggered if the number of affected validators is greater than or equal to this value. + +4. **Per-Module Rules for Active Validators** (`CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT`). If specific module keys are + defined, those values override the global rules for `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` and + `CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`. +``` +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = { + "n": { + "minActiveCount": <integer>, + "affectedShare": <0.xx>, + "minAffectedCount": <integer>, + } +} +``` +A critical alert is triggered for those modules if **both** conditions are met: + +* The number of active validators is greater than or equal to `minActiveCount`. +* The number of affected validators is greater than or equal to either `minAffectedCount` or `affectedShare` of the total active validators. + +For modules that don't have keys in the `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` the rules defined in the previous steps +are applied. + +5. **Per-Module Rules for Affected Validators** (`CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`). If specific module keys are + defined, those values override all other rules for the module. +``` +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = { + "n": <integer> +} +``` +A critical alert is triggered if the number of affected validators is greater than or equal to the specified value. 
+ +To illustrate these rules let's consider the following sample config: +``` +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = { + "0": { + "minActiveCount": 100, + "affectedShare": 0.3, + "minAffectedCount": 1000, + }, + "3": { + "minActiveCount": 10, + "affectedShare": 0.5, + "minAffectedCount": 200, + }, +}; +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = { + "2": 30 +}; +``` +In this case, critical alerts for any modules except 2 and 3 will be triggered for operators with at least 100 active +validators and only if either at least 1000 or 30% of active validators are affected by a critical alert (whichever +number is smaller). However, for operators from the 3rd module, these rules are weakened: a critical alert will be +triggered for operators with at least 10 active validators and only if either 200 or 50% of validators are affected. + +These rules are not applied to the 2nd module. For this module, critical alerts will be triggered for all operators +with at least 30 affected validators (no matter how many active validators they have). + +If `ethereum_validators_monitoring_data_actuality < 1h`, the alerts from the table below are sent. 
+ +| Alert name | Description | If fired repeat | If value increased repeat | +|----------------------------|---------------------------------------------------------------------------------------------------------|-----------------|---------------------------| +| CriticalSlashing | At least one validator was slashed | instant | - | +| CriticalMissedProposes | More than 1/3 blocks from Node Operator duties was missed in the last 12 hours | every 6h | - | +| CriticalNegativeDelta | A certain number of validators with negative balance delta (between current and 6 epochs ago) | every 6h | every 1h | +| CriticalMissedAttestations | A certain number of validators with missed attestations in the last `{{BAD_ATTESTATION_EPOCHS}}` epochs | every 6h | every 1h | ## Application metrics diff --git a/docker/validators/custom_mainnet.yaml b/docker/validators/custom_mainnet.yaml index 5dc7c223..26e08d3f 100644 --- a/docker/validators/custom_mainnet.yaml +++ b/docker/validators/custom_mainnet.yaml @@ -1,9 +1,22 @@ -operators: - - name: Operator1 +module_1: + - name: Operator_1_0 keys: - - "0xa8088b23b6e9eaecb04c7dfd194d9e47df966605a1cf03004b7d671708421da4cb2836447f73a5f25c2cfb567b181f80" - - "0x84f6ffe8d2285b76d5076165cec8b298c8ed3dc123379de8d49ecf2e27137ebe479fec0e667322a450283c990bfe9995" - - name: Operator2 + - "0x800429af2ff9e4581b3a800cec1604de49538a50659c0cbb2b79493b5d888b2b2075f9e7163bc11024088b17c2b78107" + - "0x8004a4ddb445add99be6e41fce54ae0ceba0d802817585c900e3b43d2a35ab09a8b451d02592fa105249af07122887b8" + - name: Operator_1_1 keys: - - "0xa015a5fcd78cb52e2b1f9c1a833868f9da8dfee31c919e8e1c19aa64defdd140390a16d133b500d5a90bc99bca409908" - - "0xb9b74aaec50f74e484862b5b6bf0174ffa7344f2de2b1b89aeb233722d4bc9812ee346d99a6b0740e2c14c1580257247" + - "0x8004d6da4e9228cb0efbf383ce259338d5626029e3f80913ad1c89098d3289977ba10d873cf88c61e1b2572e26fbd318" + - "0x800532e962039d57e63d1da433e26f6bbff8b15f07b90deb5be8038b7f24ddb2d71d2b26a1693a7fb9a7657f3b8b5fef" + +# Optional 
+module_2: + - name: Operator_2_0 + keys: + - "0x80081580eefc89c95874ca868cb439a0c51b4b6f97483632ea597e4801c47f03a8f45360a44411c2320296c737c89bc6" + - "0x8008b169609ee48ef4bd36c37bb2d0c5f9fe0335f28396d5aa8620409912e16c06b4ae2048542492007a2005928b074c" + - "0x800e4b8fa424ff35feef522592f3e711a46b426320a7dc40044fb02537e0faf25566e47c72172a3020d0c6bc1648ecc8" + - name: Operator_2_1 + keys: + - "0x80096ff18d55b9b08c1778568867210d9110f5a2200962a962846d09a75bfa29177c42b83903ed0cb0b69f8a061e3e11" + - "0x800c8cb0fcd6104cbdf76120352c1651e858eef2fad8142ebca37d26f76a16c5f692f9b987bb22dd6eb5dd0dc9e021a4" + - "0x800cd7cf64998da8d95ac0e864012922904b78cccc28f2fa88f3bf019ecc8779833d1c7e09d62700b14d2b015f002a52" diff --git a/docker/validators/custom_testnet.yaml b/docker/validators/custom_testnet.yaml index c85b729d..37cf2b60 100644 --- a/docker/validators/custom_testnet.yaml +++ b/docker/validators/custom_testnet.yaml @@ -1,9 +1,22 @@ -operators: - - name: Operator1 +module_1: + - name: Operator_1_0 keys: - - "0xafbf5b06e7953b095a9946cc7ee8f2ecf1312878bd196af4d06661bc7718f1ae2d5c9f8b635f5924bf5d8266234607f8" - - "0x925c1f368524be3fa83c52f40151724b38fb4ebfe64f64f70942aa9a307b81843d9514c1d8f3c8689236f0f1ccd6c6d4" - - name: Operator2 + - "0x8000011bc03bbf99ac5964d14d3bb52de983c848cc3734d736235a19715e8cbbd5e963163eb4bd2d8cd473d103b95c12" + - "0x80000b1388d41e2cb346e6a85d94fccc6510a11d5bd91699e156907b53e1f5c265effa87f492b7cba7fe218f232c6c39" + - name: Operator_1_1 keys: - - "0xb5b9b79942fcce7ddd2c3b00dae34e571fb77f0630d4fdeccba3721b6549013b55cbfe643d96cbe920864795c5f01db6" - - "0xb3ddd2b56dbf80ba035d948709099f8ad7241929a051140ce2698fae216293d98c792314c414afb0ed3b849323b523c6" + - "0x800010c6cde9a31d218347c9d042ceff227a1dbec3970336bd8cd6d767fd0f2e587332ef6a3010b1b0f5d04288483d44" + - "0x80001887f6c44f54e043866a6536b940f1c2bdf0a99203f217940fab8684e77fa1c9cc64537464d7d2b681115eec446a" + +# Optional +module_2: + - name: Operator_2_0 + keys: + - 
"0x80002248327da011001f38ab78e277ed5ddc1448078a1ba3f1cb47fd20f65f6de07808d7c3c96a2a795011b25100cc1d" + - "0x800037d7c5468fb960d7e5cb40c2d9c39d6713676d9bc971e92692759ac7ba5b0f12d034282e0cfd4cf2c1212d38dd2a" + - "0x80003ad67e896cb261a17398e77e474a7ffc7898a40cf004a74ea8d20b2b562ac7906a3a62656bfbc1d3033748cdd972" + - name: Operator_2_1 + keys: + - "0x80004546cdf353788bd0fb2048c80ecaae4dbd72ed1b9e51d90c0457d57f5e3577778a9710f267aa1e50ce0d5df6fa28" + - "0x80008083f7eb1366eaef3992c48e0ced5dadef0e4405c7b9a0a662322847f98022d970e6a13cf12da9d199b7518562f7" + - "0x80009e291a1e81be05ffce78180bb0a240242466af9613ef8dd34a8f1289f9b9dfc2c98c5d40be4d61f1eb4dec559217" diff --git a/src/common/alertmanager/alerts/BasicAlert.ts b/src/common/alertmanager/alerts/BasicAlert.ts index 7299518f..f82c68db 100644 --- a/src/common/alertmanager/alerts/BasicAlert.ts +++ b/src/common/alertmanager/alerts/BasicAlert.ts @@ -1,6 +1,6 @@ import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsStatusStats } from 'storage/clickhouse/clickhouse.types'; import { RegistrySourceOperator } from 'validators-registry'; export interface AlertRequestBody { @@ -26,22 +26,33 @@ export abstract class Alert { protected readonly config: ConfigService; protected readonly storage: ClickhouseService; protected readonly operators: RegistrySourceOperator[]; - - protected constructor(name: string, config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { + protected readonly moduleIndex: number; + protected readonly nosStats: NOsValidatorsStatusStats[]; + + protected constructor( + name: string, + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + ) { this.alertname = name; this.config = config; this.storage = storage; this.operators = operators; + this.moduleIndex = 
moduleIndex; + this.nosStats = nosStats; } - abstract alertRule(bySlot: number): Promise; + abstract alertRule(): AlertRuleResult; abstract sendRule(ruleResult?: AlertRuleResult): boolean; abstract alertBody(ruleResult: AlertRuleResult): AlertRequestBody; - async toSend(epoch: Epoch): Promise { - const ruleResult = await this.alertRule(epoch); + async toSend(): Promise { + const ruleResult = await this.alertRule(); if (this.sendRule(ruleResult)) return { timestamp: this.sendTimestamp, body: this.alertBody(ruleResult), ruleResult }; } } diff --git a/src/common/alertmanager/alerts/CriticalMissedAttestations.ts b/src/common/alertmanager/alerts/CriticalMissedAttestations.ts index 0700e378..cf1e2974 100644 --- a/src/common/alertmanager/alerts/CriticalMissedAttestations.ts +++ b/src/common/alertmanager/alerts/CriticalMissedAttestations.ts @@ -2,35 +2,57 @@ import { join } from 'lodash'; import { sentAlerts } from 'common/alertmanager'; import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsByConditionAttestationCount, NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; -const validatorsWithMissedAttestationCountThreshold = (quantity: number) => { - return Math.min(quantity / 3, 1000); -}; - export class CriticalMissedAttestations extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalMissedAttestations.name, config, storage, operators); + protected readonly missedAttValidatorsCount: NOsValidatorsByConditionAttestationCount[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + missedAttValidatorsCount: 
NOsValidatorsByConditionAttestationCount[], + ) { + const name = CriticalMissedAttestations.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.missedAttValidatorsCount = missedAttValidatorsCount; } - async alertRule(epoch: Epoch): Promise { + alertRule(): AlertRuleResult { + const alertParams = this.config.getCriticalAlertParamForModule(this.moduleIndex); const result: AlertRuleResult = {}; - const nosStats = await this.storage.getUserNodeOperatorsStats(epoch); - const missedAttValidatorsCount = await this.storage.getValidatorCountWithMissedAttestationsLastNEpoch(epoch); - for (const noStats of nosStats.filter((o) => o.active_ongoing > this.config.get('CRITICAL_ALERTS_MIN_VAL_COUNT'))) { - const operator = this.operators.find((o) => +noStats.val_nos_module_id == o.module && +noStats.val_nos_id == o.index); - const missedAtt = missedAttValidatorsCount.find( - (a) => a.val_nos_id != null && +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index, + + const activeOngoingThreshold = alertParams.affectedValCount ?? alertParams.activeValCount.minActiveCount; + + // If affectedValCount is set, we're not interested in NOs with a number of validators less than this value + // (because for these NOs it is not possible to have a number of affected validators greater than this value). + const filteredNosStats = this.nosStats.filter((o) => o.active_ongoing >= activeOngoingThreshold); + + for (const noStats of filteredNosStats) { + const operator = this.operators.find((o) => +noStats.val_nos_id === o.index); + const missedAtt = this.missedAttValidatorsCount.find( + (a) => a.val_nos_id != null && +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index, ); - if (!missedAtt) continue; - if (missedAtt.amount > validatorsWithMissedAttestationCountThreshold(noStats.active_ongoing)) { + + if (missedAtt == null) continue; + + const includeToResult = + alertParams.affectedValCount != null + ? 
missedAtt.amount >= alertParams.affectedValCount + : missedAtt.amount >= + Math.min(noStats.active_ongoing * alertParams.activeValCount.affectedShare, alertParams.activeValCount.minAffectedCount); + if (includeToResult) { result[operator.name] = { ongoing: noStats.active_ongoing, missedAtt: missedAtt.amount }; } } + return result; } @@ -54,12 +76,16 @@ export class CriticalMissedAttestations extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), labels: { alertname: this.alertname, severity: 'critical', + nos_module_id: this.moduleIndex, ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), }, annotations: { @@ -67,7 +93,7 @@ export class CriticalMissedAttestations extends Alert { Object.values(ruleResult).length } Node Operators with CRITICAL count of validators with missed attestations in the last ${this.config.get( 'BAD_ATTESTATION_EPOCHS', - )} epoch`, + )} epoch in module ${this.moduleIndex}`, description: join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.missedAtt} of ${r.ongoing}`), '\n', diff --git a/src/common/alertmanager/alerts/CriticalMissedProposes.ts b/src/common/alertmanager/alerts/CriticalMissedProposes.ts index 73682c4d..8669a716 100644 --- a/src/common/alertmanager/alerts/CriticalMissedProposes.ts +++ b/src/common/alertmanager/alerts/CriticalMissedProposes.ts @@ -2,8 +2,8 @@ import { join } from 'lodash'; import { sentAlerts } from 'common/alertmanager'; import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { 
ClickhouseService } from 'storage'; +import { NOsProposesStats, NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; @@ -11,24 +11,42 @@ import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; const VALIDATORS_WITH_MISSED_PROPOSALS_COUNT_THRESHOLD = 1 / 3; export class CriticalMissedProposes extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalMissedProposes.name, config, storage, operators); + protected readonly proposes: NOsProposesStats[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + proposes: NOsProposesStats[], + ) { + const name = CriticalMissedProposes.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.proposes = proposes; } - async alertRule(epoch: Epoch): Promise { + alertRule(): AlertRuleResult { + const alertParams = this.config.getCriticalAlertParamForModule(this.moduleIndex); const result: AlertRuleResult = {}; - const nosStats = await this.storage.getUserNodeOperatorsStats(epoch); - const proposes = await this.storage.getUserNodeOperatorsProposesStats(epoch); // ~12h range - for (const noStats of nosStats.filter((o) => o.active_ongoing > this.config.get('CRITICAL_ALERTS_MIN_VAL_COUNT'))) { - const operator = this.operators.find((o) => +noStats.val_nos_module_id == o.module && +noStats.val_nos_id == o.index); - const proposeStats = proposes.find( - (a) => a.val_nos_id != null && +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index, + + const activeOngoingThreshold = alertParams.affectedValCount ?? 
alertParams.activeValCount.minActiveCount; + const filteredNosStats = this.nosStats.filter((o) => o.active_ongoing >= activeOngoingThreshold); + + for (const noStats of filteredNosStats) { + const operator = this.operators.find((o) => +noStats.val_nos_id === o.index); + const proposeStats = this.proposes.find( + (a) => a.val_nos_id != null && +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index, ); - if (!proposeStats) continue; - if (proposeStats.missed > proposeStats.all * VALIDATORS_WITH_MISSED_PROPOSALS_COUNT_THRESHOLD) { + + if (proposeStats == null) continue; + + if (proposeStats.missed >= proposeStats.all * VALIDATORS_WITH_MISSED_PROPOSALS_COUNT_THRESHOLD) { result[operator.name] = { all: proposeStats.all, missed: proposeStats.missed }; } } + return result; } @@ -51,12 +69,22 @@ export class CriticalMissedProposes extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), - labels: { alertname: this.alertname, severity: 'critical', ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS') }, + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), + labels: { + alertname: this.alertname, + severity: 'critical', + nos_module_id: this.moduleIndex, + ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), + }, annotations: { - summary: `${Object.values(ruleResult).length} Node Operators with CRITICAL count of missed proposes in the last 12 hours`, + summary: `${ + Object.values(ruleResult).length + } Node Operators with CRITICAL count of missed proposes in the last 12 hours in module ${this.moduleIndex}`, description: join( 
Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.missed} of ${r.all} proposes`), '\n', diff --git a/src/common/alertmanager/alerts/CriticalNegativeDelta.ts b/src/common/alertmanager/alerts/CriticalNegativeDelta.ts index e376d4b8..fb371d21 100644 --- a/src/common/alertmanager/alerts/CriticalNegativeDelta.ts +++ b/src/common/alertmanager/alerts/CriticalNegativeDelta.ts @@ -2,33 +2,57 @@ import { join } from 'lodash'; import { sentAlerts } from 'common/alertmanager'; import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsNegDeltaCount, NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; -const validatorsWithNegativeDeltaCountThreshold = (quantity: number) => { - return Math.min(quantity / 3, 1000); -}; - export class CriticalNegativeDelta extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalNegativeDelta.name, config, storage, operators); + protected readonly negativeValidatorsCount: NOsValidatorsNegDeltaCount[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + negativeValidatorsCount: NOsValidatorsNegDeltaCount[], + ) { + const name = CriticalNegativeDelta.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.negativeValidatorsCount = negativeValidatorsCount; } - async alertRule(epoch: Epoch): Promise { + alertRule(): AlertRuleResult { + const alertParams = this.config.getCriticalAlertParamForModule(this.moduleIndex); const result: AlertRuleResult = {}; - const nosStats = await this.storage.getUserNodeOperatorsStats(epoch); - const negativeValidatorsCount 
= await this.storage.getValidatorsCountWithNegativeDelta(epoch); - for (const noStats of nosStats.filter((o) => o.active_ongoing > this.config.get('CRITICAL_ALERTS_MIN_VAL_COUNT'))) { - const operator = this.operators.find((o) => +noStats.val_nos_module_id == o.module && +noStats.val_nos_id == o.index); - const negDelta = negativeValidatorsCount.find((a) => +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index); - if (!negDelta) continue; - if (negDelta.amount > validatorsWithNegativeDeltaCountThreshold(noStats.active_ongoing)) { + + const activeOngoingThreshold = alertParams.affectedValCount ?? alertParams.activeValCount.minActiveCount; + + // If affectedValCount is set, we're not interested in NOs with a number of validators less than this value + // (because for these NOs it is not possible to have a number of affected validators greater than this value). + const filteredNosStats = this.nosStats.filter((o) => o.active_ongoing >= activeOngoingThreshold); + + for (const noStats of filteredNosStats) { + const operator = this.operators.find((o) => +noStats.val_nos_id === o.index); + const negDelta = this.negativeValidatorsCount.find( + (a) => +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index, + ); + + if (negDelta == null) continue; + + const includeToResult = + alertParams.affectedValCount != null + ? 
negDelta.amount >= alertParams.affectedValCount + : negDelta.amount >= + Math.min(noStats.active_ongoing * alertParams.activeValCount.affectedShare, alertParams.activeValCount.minAffectedCount); + if (includeToResult) { result[operator.name] = { ongoing: noStats.active_ongoing, negDelta: negDelta.amount }; } } + return result; } @@ -52,12 +76,22 @@ export class CriticalNegativeDelta extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), - labels: { alertname: this.alertname, severity: 'critical', ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS') }, + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), + labels: { + alertname: this.alertname, + severity: 'critical', + nos_module_id: this.moduleIndex, + ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), + }, annotations: { - summary: `${Object.values(ruleResult).length} Node Operators with CRITICAL count of validators with negative delta`, + summary: `${Object.values(ruleResult).length} Node Operators with CRITICAL count of validators with negative delta in module ${ + this.moduleIndex + }`, description: join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.negDelta} of ${r.ongoing}`), '\n', diff --git a/src/common/alertmanager/alerts/CriticalSlashing.ts b/src/common/alertmanager/alerts/CriticalSlashing.ts index 0ad43d62..d9dd0aef 100644 --- a/src/common/alertmanager/alerts/CriticalSlashing.ts +++ b/src/common/alertmanager/alerts/CriticalSlashing.ts @@ -1,30 +1,43 @@ import { join } from 'lodash'; import { ConfigService } from 'common/config'; -import { Epoch } from 
'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; export class CriticalSlashing extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalSlashing.name, config, storage, operators); + protected readonly prevNosStats: NOsValidatorsStatusStats[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + prevNosStats: NOsValidatorsStatusStats[], + ) { + const name = CriticalSlashing.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.prevNosStats = prevNosStats; } - async alertRule(epoch: Epoch): Promise { + async alertRule(): Promise { const result: AlertRuleResult = {}; - const currOperators = await this.storage.getUserNodeOperatorsStats(epoch); - const prevOperators = await this.storage.getUserNodeOperatorsStats(epoch - 1); // compare with previous epoch - for (const currOperator of currOperators) { - const operator = this.operators.find((o) => +currOperator.val_nos_module_id == o.module && +currOperator.val_nos_id == o.index); - const prevOperator = prevOperators.find((a) => +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index); + + for (const currOperator of this.nosStats) { + const operator = this.operators.find((o) => +currOperator.val_nos_id === o.index); + const prevOperator = this.prevNosStats.find((a) => +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index); + // if count of slashed validators increased, we should alert about it - const prevSlashed = prevOperator ? prevOperator.slashed : 0; + const prevSlashed = prevOperator != null ? 
prevOperator.slashed : 0; if (currOperator.slashed > prevSlashed) { result[operator.name] = { ongoing: currOperator.active_ongoing, slashed: currOperator.slashed - prevSlashed }; } } + return result; } @@ -34,12 +47,20 @@ export class CriticalSlashing extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), - labels: { alertname: this.alertname, severity: 'critical', ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS') }, + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), + labels: { + alertname: this.alertname, + severity: 'critical', + nos_module_id: this.moduleIndex, + ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), + }, annotations: { - summary: `${Object.values(ruleResult).length} Node Operators with SLASHED validators`, + summary: `${Object.values(ruleResult).length} Node Operators with SLASHED validators in module ${this.moduleIndex}`, description: join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.slashed} of ${r.ongoing}`), '\n', diff --git a/src/common/alertmanager/critical-alerts.service.ts b/src/common/alertmanager/critical-alerts.service.ts index aea0d9a6..758ab94f 100644 --- a/src/common/alertmanager/critical-alerts.service.ts +++ b/src/common/alertmanager/critical-alerts.service.ts @@ -46,29 +46,57 @@ export class CriticalAlertsService { return; } try { - let count = 0; - for (const alert of this.alerts) { + const moduleIndexes = this.registryService.getModuleIndexes(); + const [nosStats, missedAttValidatorsCount, proposes, negativeValidatorsCount, prevNosStats] = await Promise.all([ + 
this.storage.getUserNodeOperatorsStats(epoch), + this.storage.getValidatorCountWithMissedAttestationsLastNEpoch(epoch), + this.storage.getUserNodeOperatorsProposesStats(epoch), // ~12h range + this.storage.getValidatorsCountWithNegativeDelta(epoch), + this.storage.getUserNodeOperatorsStats(epoch - 1), + ]); + + const alerts = []; + for (const moduleIndex of moduleIndexes) { + const nosStatsForModule = nosStats.filter((o) => +o.val_nos_module_id === moduleIndex); + const operatorsForModule = this.operators.filter((o) => o.module === moduleIndex); + + alerts.push( + ...[ + new CriticalMissedAttestations( + this.config, + this.storage, + operatorsForModule, + moduleIndex, + nosStatsForModule, + missedAttValidatorsCount, + ), + new CriticalMissedProposes(this.config, this.storage, operatorsForModule, moduleIndex, nosStatsForModule, proposes), + new CriticalNegativeDelta( + this.config, + this.storage, + operatorsForModule, + moduleIndex, + nosStatsForModule, + negativeValidatorsCount, + ), + new CriticalSlashing(this.config, this.storage, operatorsForModule, moduleIndex, nosStatsForModule, prevNosStats), + ], + ); + } + + for (const alert of alerts) { const toSend = await alert.toSend(epoch); - if (!toSend) continue; - count++; + if (toSend == null) continue; + await this.fire(toSend.body).then(() => (sentAlerts[alert.alertname] = toSend)); + this.logger.log(`Sent ${alert.alertname} alert`); } - this.logger.log(`Sent critical alerts: ${count}`); } catch (e) { this.logger.error(`Error when trying to processing critical alerts`); this.logger.error(e as Error); } } - private get alerts() { - return [ - new CriticalNegativeDelta(this.config, this.storage, this.operators), - new CriticalMissedProposes(this.config, this.storage, this.operators), - new CriticalMissedAttestations(this.config, this.storage, this.operators), - new CriticalSlashing(this.config, this.storage, this.operators), - ]; - } - private async fire(alert: AlertRequestBody) { got 
.post(`${this.baseUrl}/api/v1/alerts`, { json: [alert] }) diff --git a/src/common/config/config.service.ts b/src/common/config/config.service.ts index 3eec4ca1..272c020b 100644 --- a/src/common/config/config.service.ts +++ b/src/common/config/config.service.ts @@ -1,6 +1,7 @@ import { ConfigService as ConfigServiceSource } from '@nestjs/config'; import { EnvironmentVariables } from './env.validation'; +import { CriticalAlertParamsForModule } from './interfaces'; export class ConfigService extends ConfigServiceSource { /** @@ -13,4 +14,43 @@ export class ConfigService extends ConfigServiceSource { public get(key: T): EnvironmentVariables[T] { return super.get(key, { infer: true }) as EnvironmentVariables[T]; } + + public getCriticalAlertParamForModule(moduleIndex: number): CriticalAlertParamsForModule { + const minValCount = this.get('CRITICAL_ALERTS_MIN_VAL_COUNT'); + const minActiveValCount = this.get('CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT'); + const minAffectedValCount = this.get('CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT'); + + if (minAffectedValCount[moduleIndex] != null) { + return { + affectedValCount: minAffectedValCount[moduleIndex], + }; + } + + if (minActiveValCount[moduleIndex] != null) { + return { + activeValCount: minActiveValCount[moduleIndex], + }; + } + + if (minAffectedValCount[0] != null) { + return { + affectedValCount: minAffectedValCount[0], + }; + } + + if (minActiveValCount[0] != null) { + return { + activeValCount: minActiveValCount[0], + }; + } + + // default values if the only CRITICAL_ALERTS_MIN_VAL_COUNT is set + return { + activeValCount: { + minActiveCount: minValCount, + affectedShare: 0.33, + minAffectedCount: 1000, + }, + }; + } } diff --git a/src/common/config/env.validation.ts b/src/common/config/env.validation.ts index 54de48e2..a3c88abd 100644 --- a/src/common/config/env.validation.ts +++ b/src/common/config/env.validation.ts @@ -277,9 +277,18 @@ export class EnvironmentVariables { * Critical alerts will be sent for NOs with 
validators count greater this value */ @IsNumber() + @Min(1) @Transform(({ value }) => parseInt(value, 10), { toClassOnly: true }) public CRITICAL_ALERTS_MIN_VAL_COUNT = 100; + @IsObject() + @Transform(({ value }) => JSON.parse(value), { toClassOnly: true }) + public CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = {}; + + @IsObject() + @Transform(({ value }) => JSON.parse(value), { toClassOnly: true }) + public CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = {}; + @IsString() public CRITICAL_ALERTS_ALERTMANAGER_URL = ''; diff --git a/src/common/config/interfaces/environment.interface.ts b/src/common/config/interfaces/environment.interface.ts index 58dc31f1..79137f3f 100644 --- a/src/common/config/interfaces/environment.interface.ts +++ b/src/common/config/interfaces/environment.interface.ts @@ -18,3 +18,12 @@ export enum LogFormat { json = 'json', simple = 'simple', } + +export interface CriticalAlertParamsForModule { + activeValCount?: { + minActiveCount: number; + affectedShare: number; + minAffectedCount: number; + }; + affectedValCount?: number; +} diff --git a/src/common/consensus-provider/consensus-provider.service.ts b/src/common/consensus-provider/consensus-provider.service.ts index e39a62c7..91db9529 100644 --- a/src/common/consensus-provider/consensus-provider.service.ts +++ b/src/common/consensus-provider/consensus-provider.service.ts @@ -86,7 +86,7 @@ export class ConsensusProviderService { { maxRetries: this.config.get('CL_API_GET_BLOCK_INFO_MAX_RETRIES'), useFallbackOnResolved: (r) => { - if (this.workingMode === WorkingMode.Finalized && r.hasOwnProperty('finalized') && !r.finalized) { + if (this.workingMode === WorkingMode.Finalized && r.finalized != null && !r.finalized) { this.logger.error(`getLatestBlockHeader: slot [${r.data.header.message.slot}] is not finalized`); return true; } @@ -241,7 +241,7 @@ export class ConsensusProviderService { { maxRetries: this.config.get('CL_API_GET_BLOCK_INFO_MAX_RETRIES'), useFallbackOnResolved: (r) => { - if (this.workingMode 
=== WorkingMode.Finalized && blockId !== 'head' && r.hasOwnProperty('finalized') && !r.finalized) { + if (this.workingMode === WorkingMode.Finalized && blockId !== 'head' && r.finalized != null && !r.finalized) { this.logger.error(`getBlockInfo: slot [${r.data.message.slot}] is not finalized`); return true; } @@ -280,7 +280,7 @@ export class ConsensusProviderService { public async getSyncCommitteeInfo(stateId: StateId, epoch: Epoch): Promise { return await this.retryRequest(async (apiURL: string) => this.apiGet(apiURL, this.endpoints.syncCommittee(stateId, epoch)), { useFallbackOnResolved: (r) => { - if (this.workingMode === WorkingMode.Finalized && stateId !== 'head' && r.hasOwnProperty('finalized') && !r.finalized) { + if (this.workingMode === WorkingMode.Finalized && stateId !== 'head' && r.finalized != null && !r.finalized) { this.logger.error(`getSyncCommitteeInfo: state ${stateId} for epoch ${epoch} is not finalized`); return true; } diff --git a/src/common/functions/urljoin.ts b/src/common/functions/urljoin.ts index b5a4bda4..70f6dffd 100644 --- a/src/common/functions/urljoin.ts +++ b/src/common/functions/urljoin.ts @@ -34,14 +34,14 @@ function normalize(strArray: string[]) { if (i > 0) { // Removing the starting slashes for each component but the first. - component = component.replace(/^[\/]+/, ''); + component = component.replace(/^[/]+/, ''); } if (i < strArray.length - 1) { // Removing the ending slashes for each component but the last. - component = component.replace(/[\/]+$/, ''); + component = component.replace(/[/]+$/, ''); } else { // For the last component we will combine multiple slashes to a single one. 
- component = component.replace(/[\/]+$/, '/'); + component = component.replace(/[/]+$/, '/'); } resultArray.push(component); diff --git a/src/inspector/inspector.service.ts b/src/inspector/inspector.service.ts index da4348df..7cfa45a8 100644 --- a/src/inspector/inspector.service.ts +++ b/src/inspector/inspector.service.ts @@ -80,7 +80,9 @@ export class InspectorService implements OnModuleInit { protected async getEpochDataToProcess(): Promise { const chosen = await this.chooseEpochToProcess(); const latestBeaconBlock = Number((await this.clClient.getLatestBlockHeader(chosen)).header.message.slot); - this.logger.debug(`getEpochDataToProcess: latest block [${latestBeaconBlock}], chosen epoch [${chosen.epoch}], chosen slot [${chosen.slot}]`); + this.logger.debug( + `getEpochDataToProcess: latest block [${latestBeaconBlock}], chosen epoch [${chosen.epoch}], chosen slot [${chosen.slot}]`, + ); let latestEpoch = Math.trunc(latestBeaconBlock / this.config.get('FETCH_INTERVAL_SLOTS')); if (latestEpoch * this.config.get('FETCH_INTERVAL_SLOTS') == latestBeaconBlock) { diff --git a/src/validators-registry/file-source/file-source.service.ts b/src/validators-registry/file-source/file-source.service.ts index 1fbe79d0..05cb981a 100644 --- a/src/validators-registry/file-source/file-source.service.ts +++ b/src/validators-registry/file-source/file-source.service.ts @@ -16,11 +16,23 @@ interface FileContent { } const isValid = (data) => { - let valid = false; - data?.operators?.map((o) => { - o.name && o.keys?.length ? 
(valid = true) : (valid = false); - }); - return valid; + if (data == null || typeof data !== 'object') { + return false; + } + + for (const m of Object.values(data)) { + if (!Array.isArray(m) || m.length === 0) { + return false; + } + + for (const o of m) { + if (o.name == null || !Array.isArray(o.keys) || o.keys.length === 0) { + return false; + } + } + } + + return true; }; @Injectable() @@ -30,6 +42,7 @@ export class FileSourceService implements RegistrySource { protected data: FileContent; protected lastSuccessDataReadTimestamp: number; + protected moduleIndexes = new Set(); protected operatorsMap = new Map(); protected keysMap = new Map(); @@ -49,6 +62,10 @@ export class FileSourceService implements RegistrySource { this.updateKeysMap(); } + public getModuleIndexes(): number[] { + return [...this.moduleIndexes]; + } + public getOperatorsMap() { return this.operatorsMap; } @@ -64,6 +81,8 @@ export class FileSourceService implements RegistrySource { protected updateOperatorsMap() { this.operatorsMap = new Map(); Object.values(this.data).forEach((m, moduleIndex) => { + this.moduleIndexes.add(moduleIndex + 1); + m.forEach((o, operatorIndex) => { this.operatorsMap.set(`${moduleIndex + 1}_${operatorIndex}`, { index: operatorIndex, module: moduleIndex + 1, name: o.name }); }); diff --git a/src/validators-registry/keysapi-source/keysapi-source.service.ts b/src/validators-registry/keysapi-source/keysapi-source.service.ts index 73e4e511..9223e645 100644 --- a/src/validators-registry/keysapi-source/keysapi-source.service.ts +++ b/src/validators-registry/keysapi-source/keysapi-source.service.ts @@ -30,6 +30,10 @@ export class KeysapiSourceService implements RegistrySource { } } + public getModuleIndexes(): number[] { + return [...this.modules.values()]; + } + public getOperatorsMap(): Map { return this.operatorsMap; } diff --git a/src/validators-registry/lido-source/lido-source.service.ts b/src/validators-registry/lido-source/lido-source.service.ts index 
def82684..d39ddb4e 100644 --- a/src/validators-registry/lido-source/lido-source.service.ts +++ b/src/validators-registry/lido-source/lido-source.service.ts @@ -37,6 +37,10 @@ export class LidoSourceService implements RegistrySource { } } + public getModuleIndexes(): number[] { + return [this.registryModuleId]; + } + public getOperatorsMap() { return this.operatorsMap; } diff --git a/src/validators-registry/registry-source.interface.ts b/src/validators-registry/registry-source.interface.ts index f0e9b5e8..5a19941d 100644 --- a/src/validators-registry/registry-source.interface.ts +++ b/src/validators-registry/registry-source.interface.ts @@ -18,6 +18,7 @@ export interface RegistrySourceOperator { export interface RegistrySource { update(...args): Promise; + getModuleIndexes(): number[]; getOperatorsMap(): Map; getOperatorKey(pubKey: string): RegistrySourceKey | null; sourceTimestamp(): Promise; diff --git a/src/validators-registry/registry.service.ts b/src/validators-registry/registry.service.ts index 3c959fd4..a56db69e 100644 --- a/src/validators-registry/registry.service.ts +++ b/src/validators-registry/registry.service.ts @@ -24,6 +24,7 @@ export class RegistryService { protected lastTimestamp = 0; + protected moduleIndexes: number[] = []; protected operators = []; protected stuckKeys = []; @@ -41,6 +42,7 @@ export class RegistryService { throw Error(`Validators registry data is too old. 
Last update - ${lastUpdateTime}`); } this.operators = [...this.source.getOperatorsMap().values()]; + this.moduleIndexes = this.source.getModuleIndexes(); } public getOperatorKey(pubKey: string): RegistrySourceKeyWithOperatorName { @@ -50,6 +52,10 @@ export class RegistryService { return { ...key, operatorName: operator.name }; } + public getModuleIndexes() { + return this.moduleIndexes; + } + public getOperators() { return this.operators; } From 627cbc9f8aac6728bcdc81238562bf7d5bf159ac Mon Sep 17 00:00:00 2001 From: Alexander Lukin Date: Thu, 23 Jan 2025 15:56:34 +0400 Subject: [PATCH 3/3] chore: minor improvements 1. Replace the `any` type in the `AlertRequestBody` interface fields with more specific types. 2. Pin the Alertmanager image in the `docker-compose` file to the highest version that is compatible with the current critical alerts code. --- docker-compose.yml | 2 +- src/common/alertmanager/alerts/BasicAlert.ts | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index df4ee3d4..0351e660 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -75,7 +75,7 @@ services: - '8083:8080' alertmanager: - image: prom/alertmanager:v0.24.0 + image: prom/alertmanager:v0.26.0 container_name: alertmanager restart: unless-stopped deploy: diff --git a/src/common/alertmanager/alerts/BasicAlert.ts b/src/common/alertmanager/alerts/BasicAlert.ts index f82c68db..07d46ddd 100644 --- a/src/common/alertmanager/alerts/BasicAlert.ts +++ b/src/common/alertmanager/alerts/BasicAlert.ts @@ -6,8 +6,13 @@ import { RegistrySourceOperator } from 'validators-registry'; export interface AlertRequestBody { startsAt: string; endsAt: string; - labels: any; - annotations: any; + labels: { + [key: string]: string; + }; + annotations: { + summary: string; + description: string; + }; } export interface PreparedToSendAlert {