Skip to content
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
7481ca8
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
4b54c18
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
638668f
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
0acfbe5
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
92cfeed
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
edc2722
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
a7edf5c
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
f140f6e
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 11, 2025
cf0570b
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 15, 2025
a53f9c2
Merge branch 'master' into master
youjie23 Oct 15, 2025
d4ad7c0
Merge branch 'master' into master
wu-sheng Oct 15, 2025
5829a48
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 18, 2025
9b10401
Merge branch 'master' of github.com:youjie23/skywalking
youjie23 Oct 18, 2025
602262d
Merge branch 'master' into master
youjie23 Oct 18, 2025
4688cf7
merge master
youjie23 Oct 25, 2025
f97ad0c
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Oct 25, 2025
239439c
Merge branch 'master' into master
youjie23 Oct 27, 2025
587b2aa
chore(e2e): set allowed times to <=0 for endless trigger simulation
youjie23 Oct 30, 2025
e8b6200
Merge branch 'master' of github.com:youjie23/skywalking
youjie23 Oct 30, 2025
6b1f926
Merge branch 'master' into master
wu-sheng Oct 30, 2025
88d2c85
Merge branch 'master' into master
wu-sheng Oct 31, 2025
783ac8b
Merge branch 'master' into master
wu-sheng Nov 1, 2025
c4da5d2
chore:add logs for troubleshooting
youjie23 Nov 6, 2025
c080b31
Merge branch 'master' of github.com:youjie23/skywalking
youjie23 Nov 6, 2025
c6a8d83
chore:add logs for troubleshooting
youjie23 Nov 6, 2025
9c8651c
Revert "chore:add logs for troubleshooting"
youjie23 Nov 6, 2025
7c2b0f5
chore: remove the commented-out code
youjie23 Nov 6, 2025
4dcff48
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Nov 9, 2025
5307baf
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Nov 9, 2025
6ff7817
Merge branch 'master' into master
youjie23 Nov 10, 2025
ca113a5
enhance the alarm kernel with recovered status notification capabilit…
youjie23 Nov 12, 2025
f65414b
Merge branch 'master' into master
youjie23 Nov 12, 2025
4c1e2c6
fix Copilot review and CI fail
youjie23 Nov 12, 2025
06a96e8
Merge branch 'master' into master
youjie23 Nov 13, 2025
37cc68a
Merge branch 'master' into master
youjie23 Nov 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/skywalking.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ jobs:
if: matrix.test.docker != null
run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }}
- name: ${{ matrix.test.name }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
with:
e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }}
- if: ${{ failure() }}
Expand Down Expand Up @@ -844,7 +844,7 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: ${{ matrix.test.name }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
env:
ISTIO_VERSION: ${{ matrix.versions.istio }}
KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }}
Expand Down Expand Up @@ -905,7 +905,7 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: ${{ matrix.test.name }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
env:
ISTIO_VERSION: ${{ matrix.versions.istio }}
KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }}
Expand Down Expand Up @@ -968,7 +968,7 @@ jobs:
shell: bash
run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package
- name: Java version ${{ matrix.java-version }}
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
env:
SW_AGENT_JDK_VERSION: ${{ matrix.java-version }}
with:
Expand Down Expand Up @@ -1064,7 +1064,7 @@ jobs:
# fi
# docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v
# - name: ${{ matrix.test.name }}
# uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
# uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
# with:
# e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }}
# - if: ${{ failure() }}
Expand Down
1 change: 1 addition & 0 deletions docs/en/changes/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
* BanyanDB: support add group prefix (namespace) for BanyanDB groups.
* BanyanDB: fix when setting `@BanyanDB.TimestampColumn`, the column should not be indexed.
* OAP Self Observability: make Trace analysis metrics separate by label `protocol`, add Zipkin span dropped metrics.
* Enhance the alarm kernel with recovered status notification capability
* BanyanDB: Move data write logic from BanyanDB Java Client to OAP and support observe metrics for write operations.
* Self Observability: add write latency metrics for BanyanDB and ElasticSearch.
* Fix the malfunctioning alarm feature of MAL metrics due to unknown metadata in L2 aggregate worker.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@

package org.apache.skywalking.oap.server.core.alarm.provider;

import java.util.Map;
import java.util.Set;
import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage;
import org.joda.time.LocalDateTime;
import org.joda.time.Minutes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

/**
* Alarm core includes metrics values in certain time windows based on alarm settings. By using its internal timer
Expand Down Expand Up @@ -92,17 +94,44 @@ public void start(List<AlarmCallback> allCallbacks) {
}

if (!alarmMessageList.isEmpty()) {
List<AlarmMessage> alarmFiringMessageList = getAlarmFiringMessageList(alarmMessageList);
List<AlarmMessage> alarmRecoveryMessageList = getAlarmRecoveryMessageList(alarmMessageList);
for (AlarmCallback callback : allCallbacks) {
try {
callback.doAlarm(alarmMessageList);
if (!alarmFiringMessageList.isEmpty()) {
callback.doAlarm(alarmFiringMessageList);
}
if (!alarmRecoveryMessageList.isEmpty()) {
callback.doAlarmRecovery(alarmRecoveryMessageList);
}
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
}
}
}
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
} catch (Throwable e) {
LOGGER.error(e.getMessage(), e);
} finally {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("move to new time and check");
}
}
}, 10, 10, TimeUnit.SECONDS);
}

public static List<AlarmMessage> getAlarmFiringMessageList(List<AlarmMessage> alarmMessageList) {
return alarmMessageList
.stream()
.filter(msg -> !(msg instanceof AlarmRecoveryMessage))
.collect(Collectors.toList());
}

public static List<AlarmMessage> getAlarmRecoveryMessageList(List<AlarmMessage> alarmMessageList) {
return alarmMessageList
.stream()
.filter(msg -> msg instanceof AlarmRecoveryMessage)
.collect(Collectors.toList());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public class AlarmRule {
private String excludeNamesRegex;
private int period;
private int silencePeriod;
private int recoveryObservationPeriod;
private String message;
private Map<String, String> tags;
private Set<String> hooks;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.skywalking.oap.server.core.alarm.provider;

import java.io.IOException;

import org.apache.skywalking.apm.network.event.v3.Event;
import org.apache.skywalking.apm.network.event.v3.Source;
import org.apache.skywalking.apm.network.event.v3.Type;
Expand All @@ -27,6 +28,7 @@
import org.apache.skywalking.oap.server.core.CoreModule;
import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage;
import org.apache.skywalking.oap.server.core.analysis.IDManager;
import org.apache.skywalking.oap.server.core.analysis.Layer;
import org.apache.skywalking.oap.server.core.query.MetadataQueryService;
Expand All @@ -40,7 +42,6 @@

/**
* EventCallBack: When an alert is present, an event is generated for each alert message. These events are then sent to the internal event analyzer.
*
*/
public class EventHookCallback implements AlarmCallback {

Expand All @@ -50,8 +51,8 @@ public class EventHookCallback implements AlarmCallback {
private MetadataQueryService getMetadataQueryService() {
if (metadataQueryService == null) {
this.metadataQueryService = manager.find(CoreModule.NAME)
.provider()
.getService(MetadataQueryService.class);
.provider()
.getService(MetadataQueryService.class);
}
return metadataQueryService;
}
Expand All @@ -60,11 +61,18 @@ public EventHookCallback(ModuleManager manager) {
this.manager = manager;
}

@Override
public void doAlarm(List<AlarmMessage> alarmMessage) throws Exception {
doAlarmCallback(alarmMessage, false);
}

public void doAlarmRecovery(List<AlarmMessage> alarmRecoveryMessages) throws Exception {
doAlarmCallback(alarmRecoveryMessages, true);
}

private void doAlarmCallback(List<AlarmMessage> alarmMessage, boolean isRecovery) throws Exception {
EventAnalyzerService analyzerService = manager.find(EventAnalyzerModule.NAME).provider().getService(EventAnalyzerService.class);
for (AlarmMessage a : alarmMessage) {
for (Event event : constructCurrentEvent(a)) {
for (Event event : constructCurrentEvent(a, isRecovery)) {
analyzerService.analyze(event);
}
}
Expand All @@ -79,33 +87,33 @@ private String getLayer(String serviceId) throws IOException {
}
}

private List<Event> constructCurrentEvent(AlarmMessage msg) throws IOException {
private List<Event> constructCurrentEvent(AlarmMessage msg, boolean isRecovery) throws IOException {
List<Event> events = new ArrayList<>(2);
long now = System.currentTimeMillis();
Event.Builder builder = Event.newBuilder()
.setUuid(UUID.randomUUID().toString())
.setName("Alarm")
.setStartTime(now - (msg.getPeriod() * 60 * 1000))
.setName(isRecovery ? "AlarmRecovery" : "Alarm")
.setStartTime(isRecovery ? ((AlarmRecoveryMessage) msg).getRecoveryTime() : now - (msg.getPeriod() * 60 * 1000))
.setMessage(msg.getAlarmMessage())
.setType(Type.Error)
.setEndTime(now);
.setType(isRecovery ? Type.Normal : Type.Error)
.setEndTime(isRecovery ? ((AlarmRecoveryMessage) msg).getRecoveryTime() : now);
switch (msg.getScopeId()) {
case DefaultScopeDefine.SERVICE :
case DefaultScopeDefine.SERVICE:
IDManager.ServiceID.ServiceIDDefinition serviceIdDef = IDManager.ServiceID.analysisId(msg.getId0());
builder.setSource(
Source.newBuilder()
.setService(serviceIdDef.getName())
.build()
Source.newBuilder()
.setService(serviceIdDef.getName())
.build()
);
builder.setLayer(getLayer(msg.getId0()));
events.add(builder.build());
break;
case DefaultScopeDefine.SERVICE_RELATION :
case DefaultScopeDefine.SERVICE_RELATION:
IDManager.ServiceID.ServiceIDDefinition sourceServiceIdDef = IDManager.ServiceID.analysisId(msg.getId0());
builder.setSource(
Source.newBuilder()
.setService(sourceServiceIdDef.getName())
.build()
.setService(sourceServiceIdDef.getName())
.build()
);
builder.setLayer(getLayer(msg.getId0()));
events.add(builder.build());
Expand All @@ -118,7 +126,7 @@ private List<Event> constructCurrentEvent(AlarmMessage msg) throws IOException {
builder.setLayer(getLayer(msg.getId1()));
events.add(builder.build());
break;
case DefaultScopeDefine.SERVICE_INSTANCE :
case DefaultScopeDefine.SERVICE_INSTANCE:
IDManager.ServiceInstanceID.InstanceIDDefinition instanceIdDef = IDManager.ServiceInstanceID.analysisId(msg.getId0());
builder.setSource(
Source.newBuilder()
Expand All @@ -129,7 +137,7 @@ private List<Event> constructCurrentEvent(AlarmMessage msg) throws IOException {
builder.setLayer(getLayer(instanceIdDef.getServiceId()));
events.add(builder.build());
break;
case DefaultScopeDefine.SERVICE_INSTANCE_RELATION :
case DefaultScopeDefine.SERVICE_INSTANCE_RELATION:
IDManager.ServiceInstanceID.InstanceIDDefinition sourceInstanceIdDef = IDManager.ServiceInstanceID.analysisId(msg.getId0());
builder.setSource(
Source.newBuilder()
Expand All @@ -149,7 +157,7 @@ private List<Event> constructCurrentEvent(AlarmMessage msg) throws IOException {
builder.setLayer(getLayer(destInstanceIdDef.getServiceId()));
events.add(builder.build());
break;
case DefaultScopeDefine.ENDPOINT :
case DefaultScopeDefine.ENDPOINT:
IDManager.EndpointID.EndpointIDDefinition endpointIDDef = IDManager.EndpointID.analysisId(msg.getId0());
builder.setSource(
Source.newBuilder()
Expand All @@ -160,7 +168,7 @@ private List<Event> constructCurrentEvent(AlarmMessage msg) throws IOException {
builder.setLayer(getLayer(endpointIDDef.getServiceId()));
events.add(builder.build());
break;
case DefaultScopeDefine.ENDPOINT_RELATION :
case DefaultScopeDefine.ENDPOINT_RELATION:
IDManager.EndpointID.EndpointIDDefinition sourceEndpointIDDef = IDManager.EndpointID.analysisId(msg.getId0());
builder.setSource(
Source.newBuilder()
Expand Down
Loading
Loading