Skip to content
Open
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
7481ca8
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
4b54c18
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
638668f
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
0acfbe5
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
92cfeed
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
edc2722
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
a7edf5c
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
f140f6e
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
cf0570b
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 15, 2025
a53f9c2
Merge branch 'master' into master
youjie23 Oct 15, 2025
d4ad7c0
Merge branch 'master' into master
wu-sheng Oct 15, 2025
5829a48
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 18, 2025
9b10401
Merge branch 'master' of github.com:youjie23/skywalking
youjie23 Oct 18, 2025
602262d
Merge branch 'master' into master
youjie23 Oct 18, 2025
4688cf7
merge master
youjie23 Oct 25, 2025
f97ad0c
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 25, 2025
239439c
Merge branch 'master' into master
youjie23 Oct 27, 2025
587b2aa
chore(e2e): set allowed times to <=0 for endless trigger simulation
youjie23 Oct 30, 2025
e8b6200
Merge branch 'master' of github.com:youjie23/skywalking
youjie23 Oct 30, 2025
6b1f926
Merge branch 'master' into master
wu-sheng Oct 30, 2025
88d2c85
Merge branch 'master' into master
wu-sheng Oct 31, 2025
783ac8b
Merge branch 'master' into master
wu-sheng Nov 1, 2025
c4da5d2
chore:add logs for troubleshooting
youjie23 Nov 6, 2025
c080b31
Merge branch 'master' of github.com:youjie23/skywalking
youjie23 Nov 6, 2025
c6a8d83
chore:add logs for troubleshooting
youjie23 Nov 6, 2025
9c8651c
Revert "chore:add logs for troubleshooting"
youjie23 Nov 6, 2025
7c2b0f5
chore: remove the commented-out code
youjie23 Nov 6, 2025
4dcff48
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Nov 9, 2025
5307baf
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Nov 9, 2025
6ff7817
Merge branch 'master' into master
youjie23 Nov 10, 2025
ca113a5
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Nov 12, 2025
f65414b
Merge branch 'master' into master
youjie23 Nov 12, 2025
4c1e2c6
fix Copilot review and CI fail
youjie23 Nov 12, 2025
06a96e8
Merge branch 'master' into master
youjie23 Nov 13, 2025
37cc68a
Merge branch 'master' into master
youjie23 Nov 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/skywalking.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ jobs:
if: matrix.test.docker != null
run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }}
- name: ${{ matrix.test.name }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
with:
e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }}
- if: ${{ failure() }}
Expand Down Expand Up @@ -844,7 +844,7 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: ${{ matrix.test.name }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
env:
ISTIO_VERSION: ${{ matrix.versions.istio }}
KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }}
Expand Down Expand Up @@ -905,7 +905,7 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: ${{ matrix.test.name }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
env:
ISTIO_VERSION: ${{ matrix.versions.istio }}
KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }}
Expand Down Expand Up @@ -968,7 +968,7 @@ jobs:
shell: bash
run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package
- name: Java version ${{ matrix.java-version }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
env:
SW_AGENT_JDK_VERSION: ${{ matrix.java-version }}
with:
Expand Down Expand Up @@ -1064,7 +1064,7 @@ jobs:
# fi
# docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v
# - name: ${{ matrix.test.name }}
# uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
# uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
# with:
# e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }}
# - if: ${{ failure() }}
Expand Down
1 change: 1 addition & 0 deletions docs/en/changes/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
* BanyanDB: support add group prefix (namespace) for BanyanDB groups.
* BanyanDB: fix when setting `@BanyanDB.TimestampColumn`, the column should not be indexed.
* OAP Self Observability: make Trace analysis metrics separate by label `protocol`, add Zipkin span dropped metrics.
* Enhance the alarm kernel with recovered status notification capability.
* BanyanDB: Move data write logic from BanyanDB Java Client to OAP and support observe metrics for write operations.
* Self Observability: add write latency metrics for BanyanDB and ElasticSearch.
* Fix the malfunctioning alarm feature of MAL metrics due to unknown metadata in L2 aggregate worker.
Expand Down
123 changes: 108 additions & 15 deletions docs/en/setup/backend/backend-alarm.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ The metrics names in the expression could be found in the [list of all potential
If the hook name is not specified, the global hook will be used.
- **Silence period**. After the alarm is triggered at Time-N (TN), there will be silence during the **TN -> TN + period**.
By default, it works in the same manner as **period**. The same Alarm (having the same ID in the same metrics name) may only be triggered once within a period.
- **Recovery observation period**. Defines the number of consecutive periods that the alarm condition must remain false before the alarm is considered recovered. When the alarm condition becomes false, the system enters an observation period. If the condition remains false for the specified number of periods, a recovery notification is sent. If the condition becomes true again during the observation period, the alarm returns to the FIRING state.
The default value is 0, which means immediate recovery notification when the condition becomes false.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

About the default value, we could change the default rules to 0, which is treated as immediate recovery. But for people who don't have this config (users of previous versions), we had better support -1 as the default value when the config is absent, which would provide more consistent behaviour.
After all, you will send new notifications. The old configuration files don't have recovery-text-template or the related URLs, so you should handle them as normal cases. Otherwise, they are going to fail to boot and upgrade, and users would then have to change all rules manually.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make the code support a recovery period of -1 as meaning no recovery rules, and support a missing recovery-text-template in hooks as meaning there is no need to send recovery notifications.

Copy link
Author

@youjie23 youjie23 Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review.
Yes, I also agree that the upgrade should not cause any additional hassle for existing users. I have already addressed this in the latest commit: if neither recovery-text-template nor recovery-urls is configured, no recovery notification will be sent externally (though it will still be persisted to storage), and this will not affect the project's startup or upgrade process.

For example,the key logic in WebhookCallback is as follows: it checks the configured URLs using the getUrls method. For recovery notifications, it specifically uses setting.getRecoveryUrls(). If this list is empty (i.e., not configured), the loop for (final var url : urls)will not execute, thus no external notification is sent.

@Override
public void doAlarmCallback(List<AlarmMessage> alarmMessages, boolean isRecovery) throws Exception {
    // ... existing setup code ...
    List<String> urls = getUrls(setting, isRecovery);
    if (setting == null || CollectionUtils.isEmpty(urls) || CollectionUtils.isEmpty(messages)) {
        continue; // This is where it skips sending if URLs are empty
    }
    for (final var url : urls) {
        // ... send message ...
    }
}

private static List<String> getUrls(WebhookSettings setting, boolean isRecovery) {
    return isRecovery ? setting.getRecoveryUrls() : setting.getUrls(); // Returns an empty list if not configured
}

Copy link
Author

@youjie23 youjie23 Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please the make the codes to support recovery period as -1 as no recovery rules. And support no recovery-text-template in hooks as no need to send recovery notifications.

Sorry. I didn't see this message when I submitted my last reply. I have already implemented the latter behavior (supporting no recovery-text-template in hooks). I'm not entirely sure if the first part (supporting recovery period as -1) is still required. The state transitions and separate storage should not introduce additional side effects, as operations like table creation are automatically handled during the startup process.



Such as for a metric, there is a shifting window as following at T7.

Expand All @@ -52,6 +55,7 @@ Such as for a metric, there is a shifting window as following at T7.
For example, expression `avg(service_resp_time) > 1000`, if the value are `1001, 1001, 1001, 1001, 1001, 1001, 1001`,
the calculation is `((1001 + 1001 + ... + 1001) / 7) > 1000` and the result would be `1`(true). Then the alarm would be triggered.
* In every minute, the window would shift automatically. At T8, Value8 would be cached, and T1/Value1 would be removed from the window.
* If Value8 is 890, the expression will be calculated based on the metric values from T2 to T8, which are `1001, 1001, 1001, 1001, 1001, 1001, 890`. The calculation becomes `((1001 + 1001 + ... + 890) / 7) < 1000`, and the result would be `0`(false). Consequently, the alarm enters an observation period for recovery. If the `Recovery observation period` is not set or is set to `0`, the alarm is considered recovered immediately, and a recovery notification is sent. Otherwise, the system will wait and observe the condition over the specified number of subsequent periods before declaring recovery.

**NOTE**:
* If the expression include labeled metrics and result has multiple labeled value(e.g. `sum(service_percentile{p='50,75'} > 1000) >= 3`), the alarm will be triggered if any of the labeled value result matches 3 times of the condition(P50 > 1000 or P75 > 1000).
Expand All @@ -69,6 +73,8 @@ rules:
period: 10
# How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
silence-period: 10
# Number of periods to wait before considering the alarm recovered, default as 0.
recovery-observation-period: 2
message: Successful rate of endpoint {name} is lower than 75%
tags:
level: WARNING
Expand Down Expand Up @@ -163,6 +169,14 @@ hooks:
"text": ":alarm_clock: *Apache Skywalking Alarm* \n **%s**."
}
}
recovery-text-template: |-
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":green_heart: *Apache SkyWalking Alarm Recovered* \n **%s**."
}
}
webhooks:
- https://hooks.slack.com/services/x/y/zssss
custom1:
Expand Down Expand Up @@ -192,12 +206,16 @@ webhook:
custom1:
urls:
- http://127.0.0.1/custom1
recovery-urls:
- http://127.0.0.1/custom1
# headers config is provided to add custom configurations or authentications that are required from the server side.
headers:
Authorization: Bearer bearer_token
custom2:
urls:
- http://127.0.0.1/custom2
recovery-urls:
- http://127.0.0.1/custom2
# headers config is provided to add custom configurations or authentications that are required from the server
headers:
Authorization: Basic basic_token
Expand All @@ -213,11 +231,13 @@ webhook:
The JSON format is based on `List<org.apache.skywalking.oap.server.core.alarm.AlarmMessage>` with the following key information:
- **scopeId**, **scope**. All scopes are defined in `org.apache.skywalking.oap.server.core.source.DefaultScopeDefine`.
- **name**. Target scope entity name. Please follow the [entity name definitions](#entity-name).
- **uuid**. The unique identifier (UUID) of the alarm, which is consistent between the trigger and recovery messages.
- **id0**. The ID of the scope entity that matches with the name. When using the relation scope, it is the source entity ID.
- **id1**. When using the relation scope, it is the destination entity ID. Otherwise, it is empty.
- **ruleName**. The rule name configured in `alarm-settings.yml`.
- **alarmMessage**. The alarm text message.
- **startTime**. The alarm time measured in milliseconds, which occurs between the current time and the midnight of January 1, 1970 UTC.
- **startTime**. The time, in milliseconds since the Unix epoch (January 1, 1970 UTC), when the alarm was triggered.
- **recoveryTime**. The time, in milliseconds since the Unix epoch (January 1, 1970 UTC), when the alarm was recovered. This value is `null` if the alarm has not been recovered.
- **tags**. The tags configured in `alarm-settings.yml`.

See the following example:
Expand All @@ -226,11 +246,13 @@ See the following example:
"scopeId": 1,
"scope": "SERVICE",
"name": "serviceA",
"uuid": "uuid1",
"id0": "12",
"id1": "",
"ruleName": "service_resp_time_rule",
"ruleName": "service_resp_time_rule",
"alarmMessage": "alarmMessage xxxx",
"startTime": 1560524171000,
"recoveryTime": 1560524271000,
Copy link

Copilot AI Nov 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The example recovery timestamp 15596606810000 appears to be in the future (approximately year 2464). This should be a realistic timestamp that comes after the startTime value of 1560524171000.

Suggested change
"recoveryTime": 15596606810000,
"recoveryTime": 1560524271000,

Copilot uses AI. Check for mistakes.
"tags": [{
"key": "level",
"value": "WARNING"
Expand All @@ -239,9 +261,10 @@ See the following example:
"scopeId": 1,
"scope": "SERVICE",
"name": "serviceB",
"uuid": "uuid2",
"id0": "23",
"id1": "",
"ruleName": "service_resp_time_rule",
"ruleName": "service_resp_time_rule",
"alarmMessage": "alarmMessage yyy",
"startTime": 1560524171000,
"tags": [{
Expand Down Expand Up @@ -275,6 +298,21 @@ message AlarmMessage {
string alarmMessage = 7;
int64 startTime = 8;
AlarmTags tags = 9;
string uuid = 10;
}

message AlarmRecoveryMessage {
int64 scopeId = 1;
string scope = 2;
string name = 3;
string id0 = 4;
string id1 = 5;
string ruleName = 6;
string alarmMessage = 7;
int64 startTime = 8;
AlarmTags tags = 9;
string uuid = 10;
int64 recoveryTime = 11;
}

message AlarmTags {
Expand Down Expand Up @@ -304,6 +342,14 @@ slack:
"text": ":alarm_clock: *Apache Skywalking Alarm* \n **%s**."
}
}
recovery-text-template: |-
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":green_heart: *Apache SkyWalking Alarm Recovered* \n **%s**."
}
}
webhooks:
- https://hooks.slack.com/services/x/y/z
```
Expand All @@ -322,6 +368,13 @@ wechat:
"content": "Apache SkyWalking Alarm: \n %s."
}
}
recovery-text-template: |-
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm Recovered: \n %s."
}
}
webhooks:
- https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key
```
Expand All @@ -341,6 +394,13 @@ dingtalk:
"content": "Apache SkyWalking Alarm: \n %s."
}
}
recovery-text-template: |-
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm Recovered: \n %s."
}
}
webhooks:
- url: https://oapi.dingtalk.com/robot/send?access_token=dummy_token
secret: dummysecret
Expand All @@ -363,6 +423,14 @@ feishu:
},
"ats":"feishu_user_id_1,feishu_user_id_2"
}
recovery-text-template: |-
{
"msg_type": "text",
"content": {
"text": "Apache SkyWalking Alarm Recovered: \n %s."
},
"ats":"feishu_user_id_1,feishu_user_id_2"
}
webhooks:
- url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token
secret: dummysecret
Expand All @@ -376,6 +444,7 @@ welink:
default:
is-default: true
text-template: "Apache SkyWalking Alarm: \n %s."
recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s."
webhooks:
# you may find your own client_id and client_secret in your app, below are dummy, need to change.
- client-id: "dummy_client_id"
Expand All @@ -400,6 +469,7 @@ pagerduty:
default:
is-default: true
text-template: "Apache SkyWalking Alarm: \n %s."
recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s."
integration-keys:
- 5c6d805c9dcf4e03d09dfa81e8789ba1
```
Expand All @@ -415,6 +485,7 @@ discord:
default:
is-default: true
text-template: "Apache SkyWalking Alarm: \n %s."
recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s."
webhooks:
- url: https://discordapp.com/api/webhooks/1008166889777414645/8e0Am4Zb-YGbBqqbiiq0jSHPTEEaHa4j1vIC-zSSm231T8ewGxgY0_XUYpY-k1nN4HBl
username: robot
Expand All @@ -430,15 +501,37 @@ the sliding window will be destroyed and re-created, causing the Alarm of this s

### Keys with data types of alerting rule configuration file

| Alerting element | Configuration property key | Type | Description |
|----------------------|----------------------------|----------------|--------------------|
| Expression | expression | string | MQE expression |
| Include names | include-names | string array | |
| Exclude names | exclude-names | string array | |
| Include names regex | include-names-regex | string | Java regex Pattern |
| Exclude names regex | exclude-names-regex | string | Java regex Pattern |
| Tags | tags | key-value pair | |
| Period | Period | int | |
| Silence period | silence-period | int | |
| Message | message | string | |
| Hooks | hooks | string array | |
| Alerting element | Configuration property key | Type | Description |
| --------------------------- | --------------------------- | -------------- | ------------------ |
| Expression | expression | string | MQE expression |
| Include names | include-names | string array | |
| Exclude names | exclude-names | string array | |
| Include names regex | include-names-regex | string | Java regex Pattern |
| Exclude names regex | exclude-names-regex | string | Java regex Pattern |
| Tags | tags | key-value pair | |
| Period | period | int | |
| Silence period | silence-period | int | |
| Recovery observation period | recovery-observation-period | int | |
| Message | message | string | |
| Hooks | hooks | string array | |

## Alarm state transition
The overall alarm state transition after the introduction of alarm restoration detection and notification since version 10.3.0 is as follows:
```mermaid
stateDiagram-v2
[*] --> NORMAL
NORMAL --> FIRING: Expression match<br/>SilencePeriod reached

FIRING --> SILENCED: Expression match<br/>SilencePeriod reached
FIRING --> OBSERVING_RECOVERY: Expression mismatch<br/>RecoveryObservationPeriod unreached
FIRING --> RECOVERED: Expression mismatch<br/>RecoveryObservationPeriod reached

SILENCED --> OBSERVING_RECOVERY: Expression mismatch<br/>RecoveryObservationPeriod unreached
SILENCED --> RECOVERED: Expression mismatch<br/>RecoveryObservationPeriod reached

OBSERVING_RECOVERY --> FIRING: Expression match<br/>SilencePeriod reached
OBSERVING_RECOVERY --> RECOVERED: Expression mismatch<br/>RecoveryObservationPeriod reached

RECOVERED --> FIRING: Expression match<br/>SilencePeriod reached
RECOVERED --> NORMAL: Expression mismatch
```
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@

package org.apache.skywalking.oap.server.core.alarm.provider;

import java.util.Map;
import java.util.Set;
import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage;
import org.joda.time.LocalDateTime;
import org.joda.time.Minutes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

/**
* Alarm core includes metrics values in certain time windows based on alarm settings. By using its internal timer
Expand Down Expand Up @@ -92,17 +94,44 @@ public void start(List<AlarmCallback> allCallbacks) {
}

if (!alarmMessageList.isEmpty()) {
List<AlarmMessage> alarmFiringMessageList = getAlarmFiringMessageList(alarmMessageList);
List<AlarmMessage> alarmRecoveryMessageList = getAlarmRecoveryMessageList(alarmMessageList);
for (AlarmCallback callback : allCallbacks) {
try {
callback.doAlarm(alarmMessageList);
if (!alarmFiringMessageList.isEmpty()) {
callback.doAlarm(alarmFiringMessageList);
}
if (!alarmRecoveryMessageList.isEmpty()) {
callback.doAlarmRecovery(alarmRecoveryMessageList);
}
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
}
}
}
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
} catch (Throwable e) {
LOGGER.error(e.getMessage(), e);
} finally {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("move to new time and check");
}
}
}, 10, 10, TimeUnit.SECONDS);
}

/**
 * Picks out the messages that are not recovery notifications.
 *
 * @param alarmMessageList the mixed list of messages to scan
 * @return a new list containing, in their original order, all elements that are
 *         not instances of {@link AlarmRecoveryMessage}
 */
public static List<AlarmMessage> getAlarmFiringMessageList(List<AlarmMessage> alarmMessageList) {
    final List<AlarmMessage> firing = new ArrayList<>();
    for (final AlarmMessage candidate : alarmMessageList) {
        // Recovery messages are handled by the recovery list; leave them out here.
        if (candidate instanceof AlarmRecoveryMessage) {
            continue;
        }
        firing.add(candidate);
    }
    return firing;
}

/**
 * Picks out the messages that are recovery notifications.
 *
 * @param alarmMessageList the mixed list of messages to scan
 * @return a new list containing, in their original order, only the elements that
 *         are instances of {@link AlarmRecoveryMessage}
 */
public static List<AlarmMessage> getAlarmRecoveryMessageList(List<AlarmMessage> alarmMessageList) {
    final List<AlarmMessage> recovered = new ArrayList<>();
    for (final AlarmMessage candidate : alarmMessageList) {
        if (candidate instanceof AlarmRecoveryMessage) {
            recovered.add(candidate);
        }
    }
    return recovered;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public class AlarmRule {
private String excludeNamesRegex;
private int period;
private int silencePeriod;
private int recoveryObservationPeriod;
private String message;
private Map<String, String> tags;
private Set<String> hooks;
Expand Down
Loading
Loading