diff --git a/.env.example.compose b/.env.example.compose index a7f7417c..adc95b69 100644 --- a/.env.example.compose +++ b/.env.example.compose @@ -27,6 +27,3 @@ VALIDATOR_REGISTRY_SOURCE=lido # Critical alerts (optional). # CRITICAL_ALERTS_ALERTMANAGER_URL=http://alertmanager:9093 # CRITICAL_ALERTS_MIN_VAL_COUNT=1 - -# Discord web-hook (optional). -# DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/... diff --git a/.env.example.local b/.env.example.local index b5666484..7bafb0be 100644 --- a/.env.example.local +++ b/.env.example.local @@ -32,6 +32,3 @@ VALIDATOR_REGISTRY_SOURCE=lido # Critical alerts (optional). # CRITICAL_ALERTS_ALERTMANAGER_URL=http://alertmanager:9093 # CRITICAL_ALERTS_MIN_VAL_COUNT=1 - -# Discord web-hook (optional). -# DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/... diff --git a/README.md b/README.md index 15b41bcc..0b2bbf38 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,8 @@ Holesky) this value should be omitted. * **Default:** ./docker/validators/lido_mainnet.db * **Note:** it makes sense to change default value if `VALIDATOR_REGISTRY_SOURCE` is set to "lido" --- -`VALIDATOR_REGISTRY_KEYSAPI_SOURCE_URLS` - Comma-separated list of URLs to [Lido Keys API service](https://github.com/lidofinance/lido-keys-api). +`VALIDATOR_REGISTRY_KEYSAPI_SOURCE_URLS` - Comma-separated list of URLs to +[Lido Keys API service](https://github.com/lidofinance/lido-keys-api). * **Required:** false * **Note:** will be used only if `VALIDATOR_REGISTRY_SOURCE` is set to "keysapi" --- @@ -278,21 +279,25 @@ Holesky) this value should be omitted. * **Required:** false * **Default:** 2 --- -`VALIDATOR_USE_STUCK_KEYS_FILE` - Use a file with list of validators that are stuck and should be excluded from the monitoring metrics. +`VALIDATOR_USE_STUCK_KEYS_FILE` - Use a file with list of validators that are stuck and should be excluded from the +monitoring metrics. 
* **Required:** false * **Values:** true / false * **Default:** false --- -`VALIDATOR_STUCK_KEYS_FILE_PATH` - Path to file with list of validators that are stuck and should be excluded from the monitoring metrics. +`VALIDATOR_STUCK_KEYS_FILE_PATH` - Path to file with list of validators that are stuck and should be excluded from the +monitoring metrics. * **Required:** false * **Default:** ./docker/validators/stuck_keys.yaml * **Note:** will be used only if `VALIDATOR_USE_STUCK_KEYS_FILE` is true --- -`SYNC_PARTICIPATION_DISTANCE_DOWN_FROM_CHAIN_AVG` - Distance (down) from Blockchain Sync Participation average after which we think that our sync participation is bad. +`SYNC_PARTICIPATION_DISTANCE_DOWN_FROM_CHAIN_AVG` - Distance (down) from Blockchain Sync Participation average after +which we think that our sync participation is bad. * **Required:** false * **Default:** 0 --- -`SYNC_PARTICIPATION_EPOCHS_LESS_THAN_CHAIN_AVG` - Number epochs after which we think that our sync participation is bad and alert about that. +`SYNC_PARTICIPATION_EPOCHS_LESS_THAN_CHAIN_AVG` - Number of epochs after which we think that our sync participation is bad +and alert about that. * **Required:** false * **Default:** 3 --- @@ -300,33 +305,180 @@ Holesky) this value should be omitted. * **Required:** false * **Default:** 3 --- -`CRITICAL_ALERTS_ALERTMANAGER_URL` - If passed, application sends additional critical alerts about validators performance to Alertmanager. +`CRITICAL_ALERTS_ALERTMANAGER_URL` - If passed, application sends additional critical alerts about validators +performance to Alertmanager. * **Required:** false --- -`CRITICAL_ALERTS_MIN_VAL_COUNT` - Critical alerts will be sent for Node Operators with validators count greater this value. +`CRITICAL_ALERTS_MIN_VAL_COUNT` - Critical alerts will be sent for Node Operators with validators count greater than or equal +to this value. 
* **Required:** false * **Default:** 100 --- +`CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` - Sets the minimum conditions for triggering critical alerts based on the number +of active validators for node operators in a specific module. + +The value must be in JSON format. Example: +`{ "0": { "minActiveCount": 100, "affectedShare": 0.33, "minAffectedCount": 1000 } }`. + +The numeric key represents the module ID. Settings under the `0` key apply to all modules unless overridden by settings +for specific module IDs. Settings for specific module IDs take precedence over the `0` key. + +A critical alert is sent if: + +* The number of active validators for a node operator meets or exceeds `minActiveCount`. +* The number of affected validators: + * Is at least `affectedShare` of the total validators for the node operator, OR + * Meets or exceeds `minAffectedCount`. +* Value in the `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` for specific module is not overridden by + `CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`. + +If no settings are provided for a specific module or the `0` key, default values are used: +`{ "minActiveCount": CRITICAL_ALERTS_MIN_VAL_COUNT, "affectedShare": 0.33, "minAffectedCount": 1000 }`. +* **Required:** false +* **Default:** {} +--- +`CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT` - Defines the minimum number of affected validators for a node operator in a +specific module for which a critical alert should be sent. + +The value must be in JSON format, for example: `{ "0": 100, "3": 50 }`. The numeric key represents the module ID. The +value for the key `0` applies to all modules. Values for non-zero keys apply only to the specified module and take +precedence over the `0` key. + +This variable takes priority over `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` and `CRITICAL_ALERTS_MIN_VAL_COUNT`. If no +value is set for a specific module or the `0` key, the rules from the other two variables will apply instead. 
+* **Required:** false +* **Default:** {} +--- `CRITICAL_ALERTS_ALERTMANAGER_LABELS` - Additional labels for critical alerts. -Must be in JSON string format. Example - '{"a":"valueA","b":"valueB"}'. +Must be in JSON string format. Example: `{ "a": "valueA", "b": "valueB" }`. * **Required:** false * **Default:** {} --- ## Application critical alerts (via Alertmanager) -In addition to alerts based on Prometheus metrics you can receive special critical alerts based on beaconchain aggregates from app. +In addition to alerts based on Prometheus metrics you can receive special critical alerts based on Beacon Chain +aggregates from app. You should pass env var `CRITICAL_ALERTS_ALERTMANAGER_URL=http://:`. -And if `ethereum_validators_monitoring_data_actuality < 1h` it allows you to receive alerts from table bellow +Critical alerts for modules are controlled by three environment variables, listed here with their priority (from lowest +to highest): +``` +CRITICAL_ALERTS_MIN_VAL_COUNT: number; +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT: { + : { + minActiveCount: number, + affectedShare: number, + minAffectedCount: number, + } +}; +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT: { + : number +}; +``` + +The following rules are applied (listed in order of increasing priority, the next rule overrides the previous one). 
-| Alert name | Description | If fired repeat | If value increased repeat | -|----------------------------|-----------------------------------------------------------------------------------------------------------------|-----------------|---------------------------| -| CriticalSlashing | At least one validator was slashed | instant | - | -| CriticalMissedProposes | More than 1/3 blocks from Node Operator duties was missed in the last 12 hours | every 6h | - | -| CriticalNegativeDelta | More than 1/3 or more than 1000 Node Operator validators with negative balance delta (between current and 6 epochs ago) | every 6h | every 1h | -| CriticalMissedAttestations | More than 1/3 or more than 1000 Node Operator validators with missed attestations in the last {{ BAD_ATTESTATION_EPOCHS }} epochs | every 6h | every 1h | +1. **Global Fallback** (`CRITICAL_ALERTS_MIN_VAL_COUNT`). If this variable is set, it acts as a default for modules by + creating an implicit rule: +``` +{ + "0": { + "minActiveCount": CRITICAL_ALERTS_MIN_VAL_COUNT, + "affectedShare": 0.33, + "minAffectedCount": 1000 + } +} +``` + +2. **Global Rules for Active Validators** (`CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT`). Default rules apply to all modules + (key `0`) unless overridden. +``` +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = { + "0": { + "minActiveCount": , + "affectedShare": <0.xx>, + "minAffectedCount": , + } +} +``` +A critical alert is triggered for a module if **both** conditions are met: +* The number of active validators meets or exceeds `minActiveCount`. +* The number of affected validators meets or exceeds either `minAffectedCount` or `affectedShare` of the total active validators. + +3. **Global Rules for Affected Validators** (`CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`). Default rules apply to all + modules (key `0`) unless overridden. +``` +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = { + "0": +} +``` +A critical alert is triggered if the number of affected validators meets or exceeds this value. + +4. 
**Per-Module Rules for Active Validators** (`CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT`). If specific module keys are + defined, those values override the global rules for `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` and + `CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`. +``` +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = { + "n": { + "minActiveCount": , + "affectedShare": <0.xx>, + "minAffectedCount": , + } +} +``` +A critical alert is triggered for those modules if **both** conditions are met: + +* The number of active validators meets or exceeds `minActiveCount`. +* The number of affected validators meets or exceeds either `minAffectedCount` or `affectedShare` of the total validators. + +For modules that don't have keys in the `CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT` the rules defined in the previous steps +are applied. + +5. **Per-Module Rules for Affected Validators** (`CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT`). If specific module keys are + defined, those values override all other rules for the module. +``` +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = { + "n": +} +``` +A critical alert is triggered if the number of affected validators meets or exceeds the specified value. + +To illustrate these rules let's consider the following sample config: +``` +CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = { + "0": { + "minActiveCount": 100, + "affectedShare": 0.3, + "minAffectedCount": 1000, + }, + "3": { + "minActiveCount": 10, + "affectedShare": 0.5, + "minAffectedCount": 200, + }, +}; +CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = { + "2": 30 +}; +``` +In this case, critical alerts for any modules except 2 and 3 will be triggered for operators with at least 100 active +validators and only if either at least 1000 or 30% of active validators are affected by a critical alert (whichever +number is smaller). However, for operators from the 3rd module, these rules are weakened: a critical alert will be +triggered for operators with at least 10 active validators and only if either 200 or 50% of validators are affected. 
+ +These rules are not applied to the 2nd module. For this module, critical alerts will be triggered for all operators +with at least 30 affected validators (no matter how many active validators they have). + +If `ethereum_validators_monitoring_data_actuality < 1h`, alerts from the table below are sent. + +| Alert name | Description | If fired repeat | If value increased repeat | +|----------------------------|---------------------------------------------------------------------------------------------------------|-----------------|---------------------------| +| CriticalSlashing | At least one validator was slashed | instant | - | +| CriticalMissedProposes | At least 1/3 of blocks from Node Operator duties were missed in the last 12 hours | every 6h | - | +| CriticalNegativeDelta | A certain number of validators with negative balance delta (between current and 6 epochs ago) | every 6h | every 1h | +| CriticalMissedAttestations | A certain number of validators with missed attestations in the last `{{BAD_ATTESTATION_EPOCHS}}` epochs | every 6h | every 1h | ## Application metrics diff --git a/docker-compose.yml b/docker-compose.yml index e8574bbf..0351e660 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -54,7 +54,7 @@ services: deploy: resources: limits: - memory: 256m + memory: 512m volumes: - ./.volumes/prometheus/:/prometheus - ./docker/prometheus/:/etc/prometheus/ @@ -75,7 +75,7 @@ services: - '8083:8080' alertmanager: - image: prom/alertmanager:latest + image: prom/alertmanager:v0.26.0 container_name: alertmanager restart: unless-stopped deploy: diff --git a/docker/prometheus/alerts_rules.yml b/docker/prometheus/alerts_rules.yml index 908eebcc..e7b257d3 100644 --- a/docker/prometheus/alerts_rules.yml +++ b/docker/prometheus/alerts_rules.yml @@ -9,7 +9,7 @@ groups: annotations: emoji: 🔪 summary: "Operators have slashed validators" - description: 'Number of slashed validators per operator' + description: 'Number of slashed validators per operator.' 
field_name: '{{ $labels.nos_name }}' field_value: '[{{ $value | printf "%.0f" }}](http://127.0.0.1:8082/d/3wimU2H7h/nodeoperators/?var-nos_name_var={{ urlquery $labels.nos_name }}&from={{ with query "(time() - 1200) * 1000" }}{{ . | first | value | printf "%f" }}{{ end }}&to={{ with query "time() * 1000" }}{{ . | first | value | printf "%f" }}{{ end }})' url: "http://127.0.0.1:8082/d/HRgPmpNnz/validators" @@ -17,16 +17,16 @@ groups: footer_icon_url: "https://cryptologos.cc/logos/steth-steth-logo.png" - alert: DataActuality - expr: absent(ethereum_validators_monitoring_data_actuality) OR (ethereum_validators_monitoring_data_actuality / 1000 > 3600) + expr: ethereum_validators_monitoring_data_actuality > 3600000 OR absent(ethereum_validators_monitoring_data_actuality) for: 1m labels: severity: critical annotations: emoji: ⏳ summary: "Data actuality greater then 1 hour" - resolved_summary: "Data actuality is back to normal and now less then 1 hour" - description: "({{ humanizeDuration $value }}) It's not OK. Please, check app health" - resolved_description: "It's OK" + resolved_summary: "Data actuality is back to normal and now less then 1 hour." + description: "({{ humanizeDuration $value }}) It's not OK. Please, check app health." + resolved_description: "It's OK." url: "http://127.0.0.1:8082/d/HRgPmpNnz/validators" footer_text: 'Epoch • {{ with query "ethereum_validators_monitoring_epoch_number" }}{{ . | first | value | printf "%.0f" }}{{ end }}' footer_icon_url: "https://cryptologos.cc/logos/steth-steth-logo.png" @@ -38,7 +38,7 @@ groups: annotations: emoji: 💸 summary: 'Operators have a negative balance delta' - resolved_summary: 'Operators have a positive balance delta' + resolved_summary: 'Operators have a positive balance delta.' description: 'Number of validators per operator who have a negative balance delta.' resolved_description: 'Number of validators per operator who recovered.' 
field_name: '{{ $labels.nos_name }}' @@ -54,7 +54,7 @@ groups: annotations: emoji: 📝❌ summary: 'Operators have missed attestation in last {{ $labels.epoch_interval }} finalized epochs' - resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs' + resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs.' description: 'Number of validators per operator who have missed attestations.' resolved_description: 'Number of validators per operator who recovered.' field_name: '{{ $labels.nos_name }}' @@ -98,7 +98,7 @@ groups: annotations: emoji: 📥 summary: 'Operators missed block propose in the last finalized epoch' - resolved_summary: 'Operators not missed block propose in the last finalized epoch' + resolved_summary: 'Operators not missed block propose in the last finalized epoch.' description: 'Number of validators per operator who missed block propose.' resolved_description: 'Number of validators per operator who recovered.' field_name: '{{ $labels.nos_name }}' @@ -114,7 +114,7 @@ groups: annotations: emoji: 🔄 summary: 'Operators sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs' - resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epochs' + resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epochs.' description: 'Number of validators per operator whose sync participation less than average.' resolved_description: 'Number of validators per operator who recovered.' field_name: '{{ $labels.nos_name }}' @@ -129,7 +129,7 @@ groups: severity: critical annotations: emoji: '📈🔄' - summary: 'Operators may get high rewards in the future, but sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs!' 
+ summary: 'Operators may get high rewards in the future, but sync participation less than average in last {{ $labels.epoch_interval }} finalized epochs' resolved_summary: 'Operators sync participation higher or equal than average in last {{ $labels.epoch_interval }} finalized epoch. Now may get high rewards in the future!' description: 'Number of validators per operator whose sync participation less than average.' resolved_description: 'Number of validators per operator who recovered.' @@ -145,7 +145,7 @@ groups: severity: critical annotations: emoji: '📈📝❌' - summary: 'Operators may get high rewards in the future, but missed attestation in last {{ $labels.epoch_interval }} finalized epochs!' + summary: 'Operators may get high rewards in the future, but missed attestation in last {{ $labels.epoch_interval }} finalized epochs' resolved_summary: 'Operators not have missed attestation in last {{ $labels.epoch_interval }} finalized epochs. Now may get high rewards in the future!' description: 'Number of validators per operator who have missed attestations.' resolved_description: 'Number of validators per operator who recovered.' @@ -161,7 +161,7 @@ groups: severity: critical annotations: emoji: '📈📥' - summary: 'Operators may get high rewards in the future, but missed block propose in the last finalized epoch!' + summary: 'Operators may get high rewards in the future, but missed block propose in the last finalized epoch' resolved_summary: 'Operators not missed block propose in the last finalized epoch. Now may get high rewards in the future!' description: 'Number of validators per operator who missed block propose.' resolved_description: 'Number of validators per operator who recovered.' 
diff --git a/docker/validators/custom_mainnet.yaml b/docker/validators/custom_mainnet.yaml index 5dc7c223..26e08d3f 100644 --- a/docker/validators/custom_mainnet.yaml +++ b/docker/validators/custom_mainnet.yaml @@ -1,9 +1,22 @@ -operators: - - name: Operator1 +module_1: + - name: Operator_1_0 keys: - - "0xa8088b23b6e9eaecb04c7dfd194d9e47df966605a1cf03004b7d671708421da4cb2836447f73a5f25c2cfb567b181f80" - - "0x84f6ffe8d2285b76d5076165cec8b298c8ed3dc123379de8d49ecf2e27137ebe479fec0e667322a450283c990bfe9995" - - name: Operator2 + - "0x800429af2ff9e4581b3a800cec1604de49538a50659c0cbb2b79493b5d888b2b2075f9e7163bc11024088b17c2b78107" + - "0x8004a4ddb445add99be6e41fce54ae0ceba0d802817585c900e3b43d2a35ab09a8b451d02592fa105249af07122887b8" + - name: Operator_1_1 keys: - - "0xa015a5fcd78cb52e2b1f9c1a833868f9da8dfee31c919e8e1c19aa64defdd140390a16d133b500d5a90bc99bca409908" - - "0xb9b74aaec50f74e484862b5b6bf0174ffa7344f2de2b1b89aeb233722d4bc9812ee346d99a6b0740e2c14c1580257247" + - "0x8004d6da4e9228cb0efbf383ce259338d5626029e3f80913ad1c89098d3289977ba10d873cf88c61e1b2572e26fbd318" + - "0x800532e962039d57e63d1da433e26f6bbff8b15f07b90deb5be8038b7f24ddb2d71d2b26a1693a7fb9a7657f3b8b5fef" + +# Optional +module_2: + - name: Operator_2_0 + keys: + - "0x80081580eefc89c95874ca868cb439a0c51b4b6f97483632ea597e4801c47f03a8f45360a44411c2320296c737c89bc6" + - "0x8008b169609ee48ef4bd36c37bb2d0c5f9fe0335f28396d5aa8620409912e16c06b4ae2048542492007a2005928b074c" + - "0x800e4b8fa424ff35feef522592f3e711a46b426320a7dc40044fb02537e0faf25566e47c72172a3020d0c6bc1648ecc8" + - name: Operator_2_1 + keys: + - "0x80096ff18d55b9b08c1778568867210d9110f5a2200962a962846d09a75bfa29177c42b83903ed0cb0b69f8a061e3e11" + - "0x800c8cb0fcd6104cbdf76120352c1651e858eef2fad8142ebca37d26f76a16c5f692f9b987bb22dd6eb5dd0dc9e021a4" + - "0x800cd7cf64998da8d95ac0e864012922904b78cccc28f2fa88f3bf019ecc8779833d1c7e09d62700b14d2b015f002a52" diff --git a/docker/validators/custom_testnet.yaml b/docker/validators/custom_testnet.yaml 
index c85b729d..37cf2b60 100644 --- a/docker/validators/custom_testnet.yaml +++ b/docker/validators/custom_testnet.yaml @@ -1,9 +1,22 @@ -operators: - - name: Operator1 +module_1: + - name: Operator_1_0 keys: - - "0xafbf5b06e7953b095a9946cc7ee8f2ecf1312878bd196af4d06661bc7718f1ae2d5c9f8b635f5924bf5d8266234607f8" - - "0x925c1f368524be3fa83c52f40151724b38fb4ebfe64f64f70942aa9a307b81843d9514c1d8f3c8689236f0f1ccd6c6d4" - - name: Operator2 + - "0x8000011bc03bbf99ac5964d14d3bb52de983c848cc3734d736235a19715e8cbbd5e963163eb4bd2d8cd473d103b95c12" + - "0x80000b1388d41e2cb346e6a85d94fccc6510a11d5bd91699e156907b53e1f5c265effa87f492b7cba7fe218f232c6c39" + - name: Operator_1_1 keys: - - "0xb5b9b79942fcce7ddd2c3b00dae34e571fb77f0630d4fdeccba3721b6549013b55cbfe643d96cbe920864795c5f01db6" - - "0xb3ddd2b56dbf80ba035d948709099f8ad7241929a051140ce2698fae216293d98c792314c414afb0ed3b849323b523c6" + - "0x800010c6cde9a31d218347c9d042ceff227a1dbec3970336bd8cd6d767fd0f2e587332ef6a3010b1b0f5d04288483d44" + - "0x80001887f6c44f54e043866a6536b940f1c2bdf0a99203f217940fab8684e77fa1c9cc64537464d7d2b681115eec446a" + +# Optional +module_2: + - name: Operator_2_0 + keys: + - "0x80002248327da011001f38ab78e277ed5ddc1448078a1ba3f1cb47fd20f65f6de07808d7c3c96a2a795011b25100cc1d" + - "0x800037d7c5468fb960d7e5cb40c2d9c39d6713676d9bc971e92692759ac7ba5b0f12d034282e0cfd4cf2c1212d38dd2a" + - "0x80003ad67e896cb261a17398e77e474a7ffc7898a40cf004a74ea8d20b2b562ac7906a3a62656bfbc1d3033748cdd972" + - name: Operator_2_1 + keys: + - "0x80004546cdf353788bd0fb2048c80ecaae4dbd72ed1b9e51d90c0457d57f5e3577778a9710f267aa1e50ce0d5df6fa28" + - "0x80008083f7eb1366eaef3992c48e0ced5dadef0e4405c7b9a0a662322847f98022d970e6a13cf12da9d199b7518562f7" + - "0x80009e291a1e81be05ffce78180bb0a240242466af9613ef8dd34a8f1289f9b9dfc2c98c5d40be4d61f1eb4dec559217" diff --git a/src/common/alertmanager/alerts/BasicAlert.ts b/src/common/alertmanager/alerts/BasicAlert.ts index 7299518f..07d46ddd 100644 --- 
a/src/common/alertmanager/alerts/BasicAlert.ts +++ b/src/common/alertmanager/alerts/BasicAlert.ts @@ -1,13 +1,18 @@ import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsStatusStats } from 'storage/clickhouse/clickhouse.types'; import { RegistrySourceOperator } from 'validators-registry'; export interface AlertRequestBody { startsAt: string; endsAt: string; - labels: any; - annotations: any; + labels: { + [key: string]: string; + }; + annotations: { + summary: string; + description: string; + }; } export interface PreparedToSendAlert { @@ -26,22 +31,33 @@ export abstract class Alert { protected readonly config: ConfigService; protected readonly storage: ClickhouseService; protected readonly operators: RegistrySourceOperator[]; - - protected constructor(name: string, config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { + protected readonly moduleIndex: number; + protected readonly nosStats: NOsValidatorsStatusStats[]; + + protected constructor( + name: string, + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + ) { this.alertname = name; this.config = config; this.storage = storage; this.operators = operators; + this.moduleIndex = moduleIndex; + this.nosStats = nosStats; } - abstract alertRule(bySlot: number): Promise; + abstract alertRule(): AlertRuleResult; abstract sendRule(ruleResult?: AlertRuleResult): boolean; abstract alertBody(ruleResult: AlertRuleResult): AlertRequestBody; - async toSend(epoch: Epoch): Promise { - const ruleResult = await this.alertRule(epoch); + async toSend(): Promise { + const ruleResult = await this.alertRule(); if (this.sendRule(ruleResult)) return { timestamp: this.sendTimestamp, body: this.alertBody(ruleResult), ruleResult }; } } diff --git 
a/src/common/alertmanager/alerts/CriticalMissedAttestations.ts b/src/common/alertmanager/alerts/CriticalMissedAttestations.ts index 0700e378..d1ee26a3 100644 --- a/src/common/alertmanager/alerts/CriticalMissedAttestations.ts +++ b/src/common/alertmanager/alerts/CriticalMissedAttestations.ts @@ -2,35 +2,57 @@ import { join } from 'lodash'; import { sentAlerts } from 'common/alertmanager'; import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsByConditionAttestationCount, NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; -const validatorsWithMissedAttestationCountThreshold = (quantity: number) => { - return Math.min(quantity / 3, 1000); -}; - export class CriticalMissedAttestations extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalMissedAttestations.name, config, storage, operators); + protected readonly missedAttValidatorsCount: NOsValidatorsByConditionAttestationCount[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + missedAttValidatorsCount: NOsValidatorsByConditionAttestationCount[], + ) { + const name = CriticalMissedAttestations.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.missedAttValidatorsCount = missedAttValidatorsCount; } - async alertRule(epoch: Epoch): Promise { + alertRule(): AlertRuleResult { + const alertParams = this.config.getCriticalAlertParamForModule(this.moduleIndex); const result: AlertRuleResult = {}; - const nosStats = await this.storage.getUserNodeOperatorsStats(epoch); - const missedAttValidatorsCount = await 
this.storage.getValidatorCountWithMissedAttestationsLastNEpoch(epoch); - for (const noStats of nosStats.filter((o) => o.active_ongoing > this.config.get('CRITICAL_ALERTS_MIN_VAL_COUNT'))) { - const operator = this.operators.find((o) => +noStats.val_nos_module_id == o.module && +noStats.val_nos_id == o.index); - const missedAtt = missedAttValidatorsCount.find( - (a) => a.val_nos_id != null && +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index, + + const activeOngoingThreshold = alertParams.affectedValCount ?? alertParams.activeValCount.minActiveCount; + + // If affectedValCount is set, we're not interested in NOs with a number of validators less than this value + // (because for these NOs it is not possible to have a number of affected validators greater than this value). + const filteredNosStats = this.nosStats.filter((o) => o.active_ongoing >= activeOngoingThreshold); + + for (const noStats of filteredNosStats) { + const operator = this.operators.find((o) => +noStats.val_nos_id === o.index); + const missedAtt = this.missedAttValidatorsCount.find( + (a) => a.val_nos_id != null && +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index, ); - if (!missedAtt) continue; - if (missedAtt.amount > validatorsWithMissedAttestationCountThreshold(noStats.active_ongoing)) { + + if (missedAtt == null) continue; + + const includeToResult = + alertParams.affectedValCount != null + ? 
missedAtt.amount >= alertParams.affectedValCount + : missedAtt.amount >= + Math.min(noStats.active_ongoing * alertParams.activeValCount.affectedShare, alertParams.activeValCount.minAffectedCount); + if (includeToResult) { result[operator.name] = { ongoing: noStats.active_ongoing, missedAtt: missedAtt.amount }; } } + return result; } @@ -54,12 +76,16 @@ export class CriticalMissedAttestations extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), labels: { alertname: this.alertname, severity: 'critical', + nos_module_id: this.moduleIndex.toString(), ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), }, annotations: { @@ -67,7 +93,7 @@ export class CriticalMissedAttestations extends Alert { Object.values(ruleResult).length } Node Operators with CRITICAL count of validators with missed attestations in the last ${this.config.get( 'BAD_ATTESTATION_EPOCHS', - )} epoch`, + )} epoch in module ${this.moduleIndex}`, description: join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.missedAtt} of ${r.ongoing}`), '\n', diff --git a/src/common/alertmanager/alerts/CriticalMissedProposes.ts b/src/common/alertmanager/alerts/CriticalMissedProposes.ts index 73682c4d..e92cf4ca 100644 --- a/src/common/alertmanager/alerts/CriticalMissedProposes.ts +++ b/src/common/alertmanager/alerts/CriticalMissedProposes.ts @@ -2,8 +2,8 @@ import { join } from 'lodash'; import { sentAlerts } from 'common/alertmanager'; import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; 
import { ClickhouseService } from 'storage'; +import { NOsProposesStats, NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; @@ -11,24 +11,42 @@ import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; const VALIDATORS_WITH_MISSED_PROPOSALS_COUNT_THRESHOLD = 1 / 3; export class CriticalMissedProposes extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalMissedProposes.name, config, storage, operators); + protected readonly proposes: NOsProposesStats[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + proposes: NOsProposesStats[], + ) { + const name = CriticalMissedProposes.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.proposes = proposes; } - async alertRule(epoch: Epoch): Promise { + alertRule(): AlertRuleResult { + const alertParams = this.config.getCriticalAlertParamForModule(this.moduleIndex); const result: AlertRuleResult = {}; - const nosStats = await this.storage.getUserNodeOperatorsStats(epoch); - const proposes = await this.storage.getUserNodeOperatorsProposesStats(epoch); // ~12h range - for (const noStats of nosStats.filter((o) => o.active_ongoing > this.config.get('CRITICAL_ALERTS_MIN_VAL_COUNT'))) { - const operator = this.operators.find((o) => +noStats.val_nos_module_id == o.module && +noStats.val_nos_id == o.index); - const proposeStats = proposes.find( - (a) => a.val_nos_id != null && +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index, + + const activeOngoingThreshold = alertParams.affectedValCount ?? 
alertParams.activeValCount.minActiveCount; + const filteredNosStats = this.nosStats.filter((o) => o.active_ongoing >= activeOngoingThreshold); + + for (const noStats of filteredNosStats) { + const operator = this.operators.find((o) => +noStats.val_nos_id === o.index); + const proposeStats = this.proposes.find( + (a) => a.val_nos_id != null && +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index, ); - if (!proposeStats) continue; - if (proposeStats.missed > proposeStats.all * VALIDATORS_WITH_MISSED_PROPOSALS_COUNT_THRESHOLD) { + + if (proposeStats == null) continue; + + if (proposeStats.missed >= proposeStats.all * VALIDATORS_WITH_MISSED_PROPOSALS_COUNT_THRESHOLD) { result[operator.name] = { all: proposeStats.all, missed: proposeStats.missed }; } } + return result; } @@ -51,12 +69,22 @@ export class CriticalMissedProposes extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), - labels: { alertname: this.alertname, severity: 'critical', ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS') }, + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), + labels: { + alertname: this.alertname, + severity: 'critical', + nos_module_id: this.moduleIndex.toString(), + ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), + }, annotations: { - summary: `${Object.values(ruleResult).length} Node Operators with CRITICAL count of missed proposes in the last 12 hours`, + summary: `${ + Object.values(ruleResult).length + } Node Operators with CRITICAL count of missed proposes in the last 12 hours in module ${this.moduleIndex}`, description: 
join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.missed} of ${r.all} proposes`), '\n', diff --git a/src/common/alertmanager/alerts/CriticalNegativeDelta.ts b/src/common/alertmanager/alerts/CriticalNegativeDelta.ts index e376d4b8..35c8da25 100644 --- a/src/common/alertmanager/alerts/CriticalNegativeDelta.ts +++ b/src/common/alertmanager/alerts/CriticalNegativeDelta.ts @@ -2,33 +2,57 @@ import { join } from 'lodash'; import { sentAlerts } from 'common/alertmanager'; import { ConfigService } from 'common/config'; -import { Epoch } from 'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsNegDeltaCount, NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; -const validatorsWithNegativeDeltaCountThreshold = (quantity: number) => { - return Math.min(quantity / 3, 1000); -}; - export class CriticalNegativeDelta extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalNegativeDelta.name, config, storage, operators); + protected readonly negativeValidatorsCount: NOsValidatorsNegDeltaCount[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + negativeValidatorsCount: NOsValidatorsNegDeltaCount[], + ) { + const name = CriticalNegativeDelta.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.negativeValidatorsCount = negativeValidatorsCount; } - async alertRule(epoch: Epoch): Promise { + alertRule(): AlertRuleResult { + const alertParams = this.config.getCriticalAlertParamForModule(this.moduleIndex); const result: AlertRuleResult = {}; - const nosStats = await this.storage.getUserNodeOperatorsStats(epoch); - const 
negativeValidatorsCount = await this.storage.getValidatorsCountWithNegativeDelta(epoch); - for (const noStats of nosStats.filter((o) => o.active_ongoing > this.config.get('CRITICAL_ALERTS_MIN_VAL_COUNT'))) { - const operator = this.operators.find((o) => +noStats.val_nos_module_id == o.module && +noStats.val_nos_id == o.index); - const negDelta = negativeValidatorsCount.find((a) => +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index); - if (!negDelta) continue; - if (negDelta.amount > validatorsWithNegativeDeltaCountThreshold(noStats.active_ongoing)) { + + const activeOngoingThreshold = alertParams.affectedValCount ?? alertParams.activeValCount.minActiveCount; + + // If affectedValCount is set, we're not interested in NOs with a number of validators less than this value + // (because for these NOs it is not possible to have a number of affected validators greater than this value). + const filteredNosStats = this.nosStats.filter((o) => o.active_ongoing >= activeOngoingThreshold); + + for (const noStats of filteredNosStats) { + const operator = this.operators.find((o) => +noStats.val_nos_id === o.index); + const negDelta = this.negativeValidatorsCount.find( + (a) => +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index, + ); + + if (negDelta == null) continue; + + const includeToResult = + alertParams.affectedValCount != null + ? 
negDelta.amount >= alertParams.affectedValCount + : negDelta.amount >= + Math.min(noStats.active_ongoing * alertParams.activeValCount.affectedShare, alertParams.activeValCount.minAffectedCount); + if (includeToResult) { result[operator.name] = { ongoing: noStats.active_ongoing, negDelta: negDelta.amount }; } } + return result; } @@ -52,12 +76,22 @@ export class CriticalNegativeDelta extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), - labels: { alertname: this.alertname, severity: 'critical', ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS') }, + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), + labels: { + alertname: this.alertname, + severity: 'critical', + nos_module_id: this.moduleIndex.toString(), + ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), + }, annotations: { - summary: `${Object.values(ruleResult).length} Node Operators with CRITICAL count of validators with negative delta`, + summary: `${Object.values(ruleResult).length} Node Operators with CRITICAL count of validators with negative delta in module ${ + this.moduleIndex + }`, description: join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.negDelta} of ${r.ongoing}`), '\n', diff --git a/src/common/alertmanager/alerts/CriticalSlashing.ts b/src/common/alertmanager/alerts/CriticalSlashing.ts index 0ad43d62..006e1687 100644 --- a/src/common/alertmanager/alerts/CriticalSlashing.ts +++ b/src/common/alertmanager/alerts/CriticalSlashing.ts @@ -1,30 +1,43 @@ import { join } from 'lodash'; import { ConfigService } from 'common/config'; -import { Epoch } from 
'common/consensus-provider/types'; import { ClickhouseService } from 'storage'; +import { NOsValidatorsStatusStats } from 'storage/clickhouse'; import { RegistrySourceOperator } from 'validators-registry'; import { Alert, AlertRequestBody, AlertRuleResult } from './BasicAlert'; export class CriticalSlashing extends Alert { - constructor(config: ConfigService, storage: ClickhouseService, operators: RegistrySourceOperator[]) { - super(CriticalSlashing.name, config, storage, operators); + protected readonly prevNosStats: NOsValidatorsStatusStats[]; + + constructor( + config: ConfigService, + storage: ClickhouseService, + operators: RegistrySourceOperator[], + moduleIndex: number, + nosStats: NOsValidatorsStatusStats[], + prevNosStats: NOsValidatorsStatusStats[], + ) { + const name = CriticalSlashing.name + 'Module' + moduleIndex; + super(name, config, storage, operators, moduleIndex, nosStats); + + this.prevNosStats = prevNosStats; } - async alertRule(epoch: Epoch): Promise { + async alertRule(): Promise { const result: AlertRuleResult = {}; - const currOperators = await this.storage.getUserNodeOperatorsStats(epoch); - const prevOperators = await this.storage.getUserNodeOperatorsStats(epoch - 1); // compare with previous epoch - for (const currOperator of currOperators) { - const operator = this.operators.find((o) => +currOperator.val_nos_module_id == o.module && +currOperator.val_nos_id == o.index); - const prevOperator = prevOperators.find((a) => +a.val_nos_module_id == operator.module && +a.val_nos_id == operator.index); + + for (const currOperator of this.nosStats) { + const operator = this.operators.find((o) => +currOperator.val_nos_id === o.index); + const prevOperator = this.prevNosStats.find((a) => +a.val_nos_module_id === operator.module && +a.val_nos_id === operator.index); + // if count of slashed validators increased, we should alert about it - const prevSlashed = prevOperator ? prevOperator.slashed : 0; + const prevSlashed = prevOperator != null ? 
prevOperator.slashed : 0; if (currOperator.slashed > prevSlashed) { result[operator.name] = { ongoing: currOperator.active_ongoing, slashed: currOperator.slashed - prevSlashed }; } } + return result; } @@ -34,12 +47,20 @@ export class CriticalSlashing extends Alert { } alertBody(ruleResult: AlertRuleResult): AlertRequestBody { + const timestampDate = new Date(this.sendTimestamp); + const timestampDatePlusTwoMins = new Date(this.sendTimestamp).setMinutes(timestampDate.getMinutes() + 2); + return { - startsAt: new Date(this.sendTimestamp).toISOString(), - endsAt: new Date(new Date(this.sendTimestamp).setMinutes(new Date(this.sendTimestamp).getMinutes() + 2)).toISOString(), - labels: { alertname: this.alertname, severity: 'critical', ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS') }, + startsAt: timestampDate.toISOString(), + endsAt: new Date(timestampDatePlusTwoMins).toISOString(), + labels: { + alertname: this.alertname, + severity: 'critical', + nos_module_id: this.moduleIndex.toString(), + ...this.config.get('CRITICAL_ALERTS_ALERTMANAGER_LABELS'), + }, annotations: { - summary: `${Object.values(ruleResult).length} Node Operators with SLASHED validators`, + summary: `${Object.values(ruleResult).length} Node Operators with SLASHED validators in module ${this.moduleIndex}`, description: join( Object.entries(ruleResult).map(([o, r]) => `${o}: ${r.slashed} of ${r.ongoing}`), '\n', diff --git a/src/common/alertmanager/critical-alerts.service.ts b/src/common/alertmanager/critical-alerts.service.ts index aea0d9a6..758ab94f 100644 --- a/src/common/alertmanager/critical-alerts.service.ts +++ b/src/common/alertmanager/critical-alerts.service.ts @@ -46,29 +46,57 @@ export class CriticalAlertsService { return; } try { - let count = 0; - for (const alert of this.alerts) { + const moduleIndexes = this.registryService.getModuleIndexes(); + const [nosStats, missedAttValidatorsCount, proposes, negativeValidatorsCount, prevNosStats] = await Promise.all([ + 
this.storage.getUserNodeOperatorsStats(epoch), + this.storage.getValidatorCountWithMissedAttestationsLastNEpoch(epoch), + this.storage.getUserNodeOperatorsProposesStats(epoch), // ~12h range + this.storage.getValidatorsCountWithNegativeDelta(epoch), + this.storage.getUserNodeOperatorsStats(epoch - 1), + ]); + + const alerts = []; + for (const moduleIndex of moduleIndexes) { + const nosStatsForModule = nosStats.filter((o) => +o.val_nos_module_id === moduleIndex); + const operatorsForModule = this.operators.filter((o) => o.module === moduleIndex); + + alerts.push( + ...[ + new CriticalMissedAttestations( + this.config, + this.storage, + operatorsForModule, + moduleIndex, + nosStatsForModule, + missedAttValidatorsCount, + ), + new CriticalMissedProposes(this.config, this.storage, operatorsForModule, moduleIndex, nosStatsForModule, proposes), + new CriticalNegativeDelta( + this.config, + this.storage, + operatorsForModule, + moduleIndex, + nosStatsForModule, + negativeValidatorsCount, + ), + new CriticalSlashing(this.config, this.storage, operatorsForModule, moduleIndex, nosStatsForModule, prevNosStats), + ], + ); + } + + for (const alert of alerts) { const toSend = await alert.toSend(epoch); - if (!toSend) continue; - count++; + if (toSend == null) continue; + await this.fire(toSend.body).then(() => (sentAlerts[alert.alertname] = toSend)); + this.logger.log(`Sent ${alert.alertname} alert`); } - this.logger.log(`Sent critical alerts: ${count}`); } catch (e) { this.logger.error(`Error when trying to processing critical alerts`); this.logger.error(e as Error); } } - private get alerts() { - return [ - new CriticalNegativeDelta(this.config, this.storage, this.operators), - new CriticalMissedProposes(this.config, this.storage, this.operators), - new CriticalMissedAttestations(this.config, this.storage, this.operators), - new CriticalSlashing(this.config, this.storage, this.operators), - ]; - } - private async fire(alert: AlertRequestBody) { got 
.post(`${this.baseUrl}/api/v1/alerts`, { json: [alert] }) diff --git a/src/common/config/config.service.ts b/src/common/config/config.service.ts index 3eec4ca1..272c020b 100644 --- a/src/common/config/config.service.ts +++ b/src/common/config/config.service.ts @@ -1,6 +1,7 @@ import { ConfigService as ConfigServiceSource } from '@nestjs/config'; import { EnvironmentVariables } from './env.validation'; +import { CriticalAlertParamsForModule } from './interfaces'; export class ConfigService extends ConfigServiceSource { /** @@ -13,4 +14,43 @@ export class ConfigService extends ConfigServiceSource { public get(key: T): EnvironmentVariables[T] { return super.get(key, { infer: true }) as EnvironmentVariables[T]; } + + public getCriticalAlertParamForModule(moduleIndex: number): CriticalAlertParamsForModule { + const minValCount = this.get('CRITICAL_ALERTS_MIN_VAL_COUNT'); + const minActiveValCount = this.get('CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT'); + const minAffectedValCount = this.get('CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT'); + + if (minAffectedValCount[moduleIndex] != null) { + return { + affectedValCount: minAffectedValCount[moduleIndex], + }; + } + + if (minActiveValCount[moduleIndex] != null) { + return { + activeValCount: minActiveValCount[moduleIndex], + }; + } + + if (minAffectedValCount[0] != null) { + return { + affectedValCount: minAffectedValCount[0], + }; + } + + if (minActiveValCount[0] != null) { + return { + activeValCount: minActiveValCount[0], + }; + } + + // default values if the only CRITICAL_ALERTS_MIN_VAL_COUNT is set + return { + activeValCount: { + minActiveCount: minValCount, + affectedShare: 0.33, + minAffectedCount: 1000, + }, + }; + } } diff --git a/src/common/config/env.validation.ts b/src/common/config/env.validation.ts index 54de48e2..a3c88abd 100644 --- a/src/common/config/env.validation.ts +++ b/src/common/config/env.validation.ts @@ -277,9 +277,18 @@ export class EnvironmentVariables { * Critical alerts will be sent for NOs with 
validators count greater this value */ @IsNumber() + @Min(1) @Transform(({ value }) => parseInt(value, 10), { toClassOnly: true }) public CRITICAL_ALERTS_MIN_VAL_COUNT = 100; + @IsObject() + @Transform(({ value }) => JSON.parse(value), { toClassOnly: true }) + public CRITICAL_ALERTS_MIN_ACTIVE_VAL_COUNT = {}; + + @IsObject() + @Transform(({ value }) => JSON.parse(value), { toClassOnly: true }) + public CRITICAL_ALERTS_MIN_AFFECTED_VAL_COUNT = {}; + @IsString() public CRITICAL_ALERTS_ALERTMANAGER_URL = ''; diff --git a/src/common/config/interfaces/environment.interface.ts b/src/common/config/interfaces/environment.interface.ts index 58dc31f1..79137f3f 100644 --- a/src/common/config/interfaces/environment.interface.ts +++ b/src/common/config/interfaces/environment.interface.ts @@ -18,3 +18,12 @@ export enum LogFormat { json = 'json', simple = 'simple', } + +export interface CriticalAlertParamsForModule { + activeValCount?: { + minActiveCount: number; + affectedShare: number; + minAffectedCount: number; + }; + affectedValCount?: number; +} diff --git a/src/common/consensus-provider/consensus-provider.service.ts b/src/common/consensus-provider/consensus-provider.service.ts index e39a62c7..91db9529 100644 --- a/src/common/consensus-provider/consensus-provider.service.ts +++ b/src/common/consensus-provider/consensus-provider.service.ts @@ -86,7 +86,7 @@ export class ConsensusProviderService { { maxRetries: this.config.get('CL_API_GET_BLOCK_INFO_MAX_RETRIES'), useFallbackOnResolved: (r) => { - if (this.workingMode === WorkingMode.Finalized && r.hasOwnProperty('finalized') && !r.finalized) { + if (this.workingMode === WorkingMode.Finalized && r.finalized != null && !r.finalized) { this.logger.error(`getLatestBlockHeader: slot [${r.data.header.message.slot}] is not finalized`); return true; } @@ -241,7 +241,7 @@ export class ConsensusProviderService { { maxRetries: this.config.get('CL_API_GET_BLOCK_INFO_MAX_RETRIES'), useFallbackOnResolved: (r) => { - if (this.workingMode 
=== WorkingMode.Finalized && blockId !== 'head' && r.hasOwnProperty('finalized') && !r.finalized) { + if (this.workingMode === WorkingMode.Finalized && blockId !== 'head' && r.finalized != null && !r.finalized) { this.logger.error(`getBlockInfo: slot [${r.data.message.slot}] is not finalized`); return true; } @@ -280,7 +280,7 @@ export class ConsensusProviderService { public async getSyncCommitteeInfo(stateId: StateId, epoch: Epoch): Promise { return await this.retryRequest(async (apiURL: string) => this.apiGet(apiURL, this.endpoints.syncCommittee(stateId, epoch)), { useFallbackOnResolved: (r) => { - if (this.workingMode === WorkingMode.Finalized && stateId !== 'head' && r.hasOwnProperty('finalized') && !r.finalized) { + if (this.workingMode === WorkingMode.Finalized && stateId !== 'head' && r.finalized != null && !r.finalized) { this.logger.error(`getSyncCommitteeInfo: state ${stateId} for epoch ${epoch} is not finalized`); return true; } diff --git a/src/common/functions/urljoin.ts b/src/common/functions/urljoin.ts index b5a4bda4..70f6dffd 100644 --- a/src/common/functions/urljoin.ts +++ b/src/common/functions/urljoin.ts @@ -34,14 +34,14 @@ function normalize(strArray: string[]) { if (i > 0) { // Removing the starting slashes for each component but the first. - component = component.replace(/^[\/]+/, ''); + component = component.replace(/^[/]+/, ''); } if (i < strArray.length - 1) { // Removing the ending slashes for each component but the last. - component = component.replace(/[\/]+$/, ''); + component = component.replace(/[/]+$/, ''); } else { // For the last component we will combine multiple slashes to a single one. 
- component = component.replace(/[\/]+$/, '/'); + component = component.replace(/[/]+$/, '/'); } resultArray.push(component); diff --git a/src/inspector/inspector.service.ts b/src/inspector/inspector.service.ts index da4348df..7cfa45a8 100644 --- a/src/inspector/inspector.service.ts +++ b/src/inspector/inspector.service.ts @@ -80,7 +80,9 @@ export class InspectorService implements OnModuleInit { protected async getEpochDataToProcess(): Promise { const chosen = await this.chooseEpochToProcess(); const latestBeaconBlock = Number((await this.clClient.getLatestBlockHeader(chosen)).header.message.slot); - this.logger.debug(`getEpochDataToProcess: latest block [${latestBeaconBlock}], chosen epoch [${chosen.epoch}], chosen slot [${chosen.slot}]`); + this.logger.debug( + `getEpochDataToProcess: latest block [${latestBeaconBlock}], chosen epoch [${chosen.epoch}], chosen slot [${chosen.slot}]`, + ); let latestEpoch = Math.trunc(latestBeaconBlock / this.config.get('FETCH_INTERVAL_SLOTS')); if (latestEpoch * this.config.get('FETCH_INTERVAL_SLOTS') == latestBeaconBlock) { diff --git a/src/validators-registry/file-source/file-source.service.ts b/src/validators-registry/file-source/file-source.service.ts index 1fbe79d0..05cb981a 100644 --- a/src/validators-registry/file-source/file-source.service.ts +++ b/src/validators-registry/file-source/file-source.service.ts @@ -16,11 +16,23 @@ interface FileContent { } const isValid = (data) => { - let valid = false; - data?.operators?.map((o) => { - o.name && o.keys?.length ? 
(valid = true) : (valid = false); - }); - return valid; + if (data == null || typeof data !== 'object') { + return false; + } + + for (const m of Object.values(data)) { + if (!Array.isArray(m) || m.length === 0) { + return false; + } + + for (const o of m) { + if (o.name == null || !Array.isArray(o.keys) || o.keys.length === 0) { + return false; + } + } + } + + return true; }; @Injectable() @@ -30,6 +42,7 @@ export class FileSourceService implements RegistrySource { protected data: FileContent; protected lastSuccessDataReadTimestamp: number; + protected moduleIndexes = new Set(); protected operatorsMap = new Map(); protected keysMap = new Map(); @@ -49,6 +62,10 @@ export class FileSourceService implements RegistrySource { this.updateKeysMap(); } + public getModuleIndexes(): number[] { + return [...this.moduleIndexes]; + } + public getOperatorsMap() { return this.operatorsMap; } @@ -64,6 +81,8 @@ export class FileSourceService implements RegistrySource { protected updateOperatorsMap() { this.operatorsMap = new Map(); Object.values(this.data).forEach((m, moduleIndex) => { + this.moduleIndexes.add(moduleIndex + 1); + m.forEach((o, operatorIndex) => { this.operatorsMap.set(`${moduleIndex + 1}_${operatorIndex}`, { index: operatorIndex, module: moduleIndex + 1, name: o.name }); }); diff --git a/src/validators-registry/keysapi-source/keysapi-source.service.ts b/src/validators-registry/keysapi-source/keysapi-source.service.ts index 73e4e511..9223e645 100644 --- a/src/validators-registry/keysapi-source/keysapi-source.service.ts +++ b/src/validators-registry/keysapi-source/keysapi-source.service.ts @@ -30,6 +30,10 @@ export class KeysapiSourceService implements RegistrySource { } } + public getModuleIndexes(): number[] { + return [...this.modules.values()]; + } + public getOperatorsMap(): Map { return this.operatorsMap; } diff --git a/src/validators-registry/lido-source/lido-source.service.ts b/src/validators-registry/lido-source/lido-source.service.ts index 
def82684..d39ddb4e 100644 --- a/src/validators-registry/lido-source/lido-source.service.ts +++ b/src/validators-registry/lido-source/lido-source.service.ts @@ -37,6 +37,10 @@ export class LidoSourceService implements RegistrySource { } } + public getModuleIndexes(): number[] { + return [this.registryModuleId]; + } + public getOperatorsMap() { return this.operatorsMap; } diff --git a/src/validators-registry/registry-source.interface.ts b/src/validators-registry/registry-source.interface.ts index f0e9b5e8..5a19941d 100644 --- a/src/validators-registry/registry-source.interface.ts +++ b/src/validators-registry/registry-source.interface.ts @@ -18,6 +18,7 @@ export interface RegistrySourceOperator { export interface RegistrySource { update(...args): Promise; + getModuleIndexes(): number[]; getOperatorsMap(): Map; getOperatorKey(pubKey: string): RegistrySourceKey | null; sourceTimestamp(): Promise; diff --git a/src/validators-registry/registry.service.ts b/src/validators-registry/registry.service.ts index 3c959fd4..a56db69e 100644 --- a/src/validators-registry/registry.service.ts +++ b/src/validators-registry/registry.service.ts @@ -24,6 +24,7 @@ export class RegistryService { protected lastTimestamp = 0; + protected moduleIndexes: number[] = []; protected operators = []; protected stuckKeys = []; @@ -41,6 +42,7 @@ export class RegistryService { throw Error(`Validators registry data is too old. Last update - ${lastUpdateTime}`); } this.operators = [...this.source.getOperatorsMap().values()]; + this.moduleIndexes = this.source.getModuleIndexes(); } public getOperatorKey(pubKey: string): RegistrySourceKeyWithOperatorName { @@ -50,6 +52,10 @@ export class RegistryService { return { ...key, operatorName: operator.name }; } + public getModuleIndexes() { + return this.moduleIndexes; + } + public getOperators() { return this.operators; }