Skip to content

Commit 2bb488d

Browse files
author
Cairry
committed
🚧 Fix(alert): Kubernetes event eval and generate fingerprint logic
1 parent 27dba70 commit 2bb488d

File tree

11 files changed

+174
-117
lines changed

11 files changed

+174
-117
lines changed

alert/consumer/handle.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func handleAlert(ctx *ctx.Context, processType string, faultCenter models.FaultC
6262
}
6363

6464
if len(routes) == 0 {
65-
logc.Infof(ctx.Ctx, "没用匹配的通知策略, 告警事件名称: %s, 通知对象名称: %s", event.RuleName, noticeData.Name)
65+
logc.Infof(ctx.Ctx, "没有匹配的通知策略, 告警事件名称: %s, 通知对象名称: %s", event.RuleName, noticeData.Name)
6666
}
6767

6868
if mute.IsMuted(mute.MuteParams{

alert/eval/query.go

Lines changed: 53 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,7 @@ func cloudWatch(ctx *ctx.Context, datasourceId, datasourceType string, rule mode
530530
}
531531

532532
func kubernetesEvent(ctx *ctx.Context, datasourceId, datasourceType string, rule models.AlertRule) []string {
533-
var externalLabels map[string]interface{}
533+
// 获取数据源实例信息
534534
datasourceObj, err := ctx.DB.Datasource().GetInstance(datasourceId)
535535
if err != nil {
536536
logc.Errorf(ctx.Ctx, "获取数据源实例失败, 规则ID: %s, 规则名称: %s, 数据源ID: %s, 错误: %v", rule.RuleId, rule.RuleName, datasourceId, err)
@@ -543,66 +543,70 @@ func kubernetesEvent(ctx *ctx.Context, datasourceId, datasourceType string, rule
543543
logc.Errorf(ctx.Ctx, "获取Kubernetes数据源客户端失败, 规则ID: %s, 规则名称: %s, 数据源ID: %s, 错误: %v", rule.RuleId, rule.RuleName, datasourceId, err)
544544
return []string{}
545545
}
546-
externalLabels = cli.(provider.KubernetesClient).GetExternalLabels()
547546

548-
k8sEvent, err := cli.(provider.KubernetesClient).GetWarningEvent(rule.KubernetesConfig.Reason, rule.KubernetesConfig.Scope, rule.KubernetesConfig.Filter)
547+
k8sClient := cli.(provider.KubernetesClient)
548+
externalLabels := k8sClient.GetExternalLabels()
549+
550+
// 查询 Kubernetes 事件
551+
k8sEventMap, err := k8sClient.GetWarningEvent(rule.KubernetesConfig.Reason, rule.KubernetesConfig.Scope, rule.KubernetesConfig.Filter)
549552
if err != nil {
550-
logc.Errorf(ctx.Ctx, "获取Kubernetes警告事件失败, 规则ID: %s, 规则名称: %s, 数据源ID: %s, 资源: %s, 错误: %v", rule.RuleId, rule.RuleName, datasourceId, rule.KubernetesConfig.Resource, err)
553+
logc.Errorf(ctx.Ctx, "获取Kubernetes警告事件失败, 规则ID: %s, 规则名称: %s, 数据源ID: %s, 原因: %s, 错误: %v", rule.RuleId, rule.RuleName, datasourceId, rule.KubernetesConfig.Reason, err)
551554
return []string{}
552555
}
553556

554-
if k8sEvent == nil {
557+
// 无事件返回
558+
if len(k8sEventMap) == 0 {
555559
return []string{}
556560
}
557561

558-
var curFingerprints []string
559-
for _, items := range k8sEvent {
560-
// 不满足阈值,跳过
561-
if len(items) < rule.KubernetesConfig.Value {
562-
continue
563-
}
562+
// 遍历事件组,评估并生成告警
563+
curFingerprints := make([]string, 0, len(k8sEventMap))
564+
for _, eventItems := range k8sEventMap {
565+
for _, k8sEvent := range eventItems {
566+
fingerprint := k8sEvent.GetFingerprint()
564567

565-
// 取第一个作为代表生成告警
566-
item := items[0]
568+
// 构建告警事件
569+
event := process.BuildEvent(rule, func() map[string]interface{} {
570+
metric := k8sEvent.GetMetrics()
571+
metric["rule_name"] = rule.RuleName
572+
metric["severity"] = rule.Severity
573+
metric["fingerprint"] = fingerprint
574+
for k, v := range externalLabels {
575+
metric[k] = v
576+
}
577+
for k, v := range rule.ExternalLabels {
578+
metric[k] = v
579+
}
580+
return metric
581+
})
567582

568-
// 构造告警内容
569-
fingerprint := item.GetFingerprint()
570-
event := process.BuildEvent(rule, func() map[string]interface{} {
571-
metric := item.GetMetrics()
572-
metric["rule_name"] = rule.RuleName
573-
metric["severity"] = rule.Severity
574-
metric["fingerprint"] = fingerprint
575-
metric["value"] = len(items)
576-
for ek, ev := range externalLabels {
577-
metric[ek] = ev
578-
}
579-
for ek, ev := range rule.ExternalLabels {
580-
metric[ek] = ev
583+
// 设置事件基本信息
584+
event.DatasourceId = datasourceId
585+
event.Fingerprint = fingerprint
586+
event.SearchQL = rule.KubernetesConfig.Resource
587+
588+
// 构建注释信息
589+
var msgList []string
590+
for _, e := range eventItems {
591+
msg := strings.ReplaceAll(e.Message, "\"", "'")
592+
msgList = append(msgList, msg)
581593
}
582-
return metric
583-
})
584-
event.DatasourceId = datasourceId
585-
event.Fingerprint = fingerprint
586-
event.SearchQL = rule.KubernetesConfig.Resource
587-
588-
// 拼接注释信息
589-
var msgList []string
590-
for _, e := range items {
591-
msg := strings.ReplaceAll(e.Message, "\"", "'")
592-
msgList = append(msgList, msg)
593-
}
594-
event.Annotations = fmt.Sprintf(
595-
"- 数据源: %s\n- 命名空间: %s\n- 资源类型: %s\n- 资源名称: %s\n- 事件类型: %s\n- 事件详情:\n%s",
596-
datasourceObj.Name,
597-
item.Namespace,
598-
item.InvolvedObject.Kind,
599-
item.InvolvedObject.Name,
600-
item.Reason,
601-
strings.Join(msgList, "\n"),
602-
)
603594

604-
curFingerprints = append(curFingerprints, event.Fingerprint)
605-
process.PushEventToFaultCenter(ctx, &event)
595+
event.Annotations = fmt.Sprintf(
596+
"- 数据源: %s\n- 命名空间: %s\n- 资源类型: %s\n- 资源名称: %s\n- 事件类型: %s\n- 事件详情:\n%s",
597+
datasourceObj.Name,
598+
k8sEvent.Namespace,
599+
k8sEvent.InvolvedObject.Kind,
600+
k8sEvent.InvolvedObject.Name,
601+
k8sEvent.Reason,
602+
strings.Join(msgList, "\n"),
603+
)
604+
605+
// 推送到故障中心
606+
process.PushEventToFaultCenter(ctx, &event)
607+
curFingerprints = append(curFingerprints, fingerprint)
608+
}
609+
606610
}
607611

608612
return curFingerprints

api/user.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
package api
22

33
import (
4-
"github.com/gin-gonic/gin"
54
middleware "watchAlert/internal/middleware"
65
"watchAlert/internal/services"
76
"watchAlert/internal/types"
87
jwtUtils "watchAlert/pkg/tools"
8+
9+
"github.com/gin-gonic/gin"
910
)
1011

1112
type userController struct{}
@@ -61,7 +62,7 @@ func (userController userController) GetUserInfo(ctx *gin.Context) {
6162
r.UserName = username
6263

6364
Service(ctx, func() (interface{}, interface{}) {
64-
return services.UserService.Get(r)
65+
return services.UserService.Info(r)
6566
})
6667
}
6768

@@ -109,7 +110,7 @@ func (userController userController) CheckUser(ctx *gin.Context) {
109110
BindQuery(ctx, r)
110111

111112
Service(ctx, func() (interface{}, interface{}) {
112-
return services.UserService.Get(r)
113+
return services.UserService.Check(r)
113114
})
114115
}
115116

internal/models/alert_current_event.go

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,34 +18,32 @@ const (
1818
)
1919

2020
type AlertCurEvent struct {
21-
TenantId string `json:"tenantId"`
22-
EventId string `json:"eventId"`
23-
RuleGroupId string `json:"rule_group_id"`
24-
RuleId string `json:"rule_id"`
25-
RuleName string `json:"rule_name"`
26-
DatasourceType string `json:"datasource_type"`
27-
DatasourceId string `json:"datasource_id" gorm:"datasource_id"`
28-
Fingerprint string `json:"fingerprint"`
29-
Severity string `json:"severity"`
30-
Labels map[string]interface{} `json:"labels" gorm:"labels;serializer:json"`
31-
SearchQL string `json:"searchQL" gorm:"-"`
32-
EvalInterval int64 `json:"eval_interval"`
33-
ForDuration int64 `json:"for_duration"`
34-
Annotations string `json:"annotations" gorm:"-"`
35-
IsRecovered bool `json:"is_recovered" gorm:"-"`
36-
FirstTriggerTime int64 `json:"first_trigger_time"` // 第一次触发时间
37-
FirstTriggerTimeFormat string `json:"first_trigger_time_format" gorm:"-"`
38-
RepeatNoticeInterval int64 `json:"repeat_notice_interval"` // 重复通知间隔时间
39-
LastEvalTime int64 `json:"last_eval_time" gorm:"-"` // 上一次评估时间
40-
LastSendTime int64 `json:"last_send_time" gorm:"-"` // 上一次发送时间
41-
RecoverTime int64 `json:"recover_time" gorm:"-"` // 恢复时间
42-
RecoverTimeFormat string `json:"recover_time_format" gorm:"-"`
43-
DutyUser string `json:"duty_user" gorm:"-"`
44-
EffectiveTime EffectiveTime `json:"effectiveTime" gorm:"effectiveTime;serializer:json"`
45-
FaultCenterId string `json:"faultCenterId"`
46-
FaultCenter FaultCenter `json:"faultCenter" gorm:"-"`
47-
ConfirmState ConfirmState `json:"confirmState" gorm:"-"`
48-
Status AlertStatus `json:"status" gorm:"-"` // 事件状态
21+
TenantId string `json:"tenantId"`
22+
EventId string `json:"eventId"`
23+
RuleGroupId string `json:"rule_group_id"`
24+
RuleId string `json:"rule_id"`
25+
RuleName string `json:"rule_name"`
26+
DatasourceType string `json:"datasource_type"`
27+
DatasourceId string `json:"datasource_id" gorm:"datasource_id"`
28+
Fingerprint string `json:"fingerprint"`
29+
Severity string `json:"severity"`
30+
Labels map[string]interface{} `json:"labels" gorm:"labels;serializer:json"`
31+
SearchQL string `json:"searchQL" gorm:"-"`
32+
EvalInterval int64 `json:"eval_interval"`
33+
ForDuration int64 `json:"for_duration"`
34+
Annotations string `json:"annotations" gorm:"-"`
35+
IsRecovered bool `json:"is_recovered" gorm:"-"`
36+
FirstTriggerTime int64 `json:"first_trigger_time"` // 第一次触发时间
37+
RepeatNoticeInterval int64 `json:"repeat_notice_interval"` // 重复通知间隔时间
38+
LastEvalTime int64 `json:"last_eval_time" gorm:"-"` // 上一次评估时间
39+
LastSendTime int64 `json:"last_send_time" gorm:"-"` // 上一次发送时间
40+
RecoverTime int64 `json:"recover_time" gorm:"-"` // 恢复时间
41+
DutyUser string `json:"duty_user" gorm:"-"`
42+
EffectiveTime EffectiveTime `json:"effectiveTime" gorm:"effectiveTime;serializer:json"`
43+
FaultCenterId string `json:"faultCenterId"`
44+
FaultCenter FaultCenter `json:"faultCenter" gorm:"-"`
45+
ConfirmState ConfirmState `json:"confirmState" gorm:"-"`
46+
Status AlertStatus `json:"status" gorm:"-"` // 事件状态
4947
}
5048

5149
type ConfirmState struct {

internal/models/rule.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@ type EsQueryFilter struct {
8383
type KubernetesConfig struct {
8484
Resource string `json:"resource"`
8585
Reason string `json:"reason"`
86-
Value int `json:"value"`
8786
Filter []string `json:"filter"`
8887
Scope int `json:"scope"`
8988
}

internal/services/tenant.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ func (ts tenantService) AddUsersToTenant(req interface{}) (data interface{}, err
123123

124124
func (ts tenantService) DelUsersOfTenant(req interface{}) (data interface{}, err interface{}) {
125125
r := req.(*types.RequestTenantQuery)
126+
if r.UserID == "admin" {
127+
return nil, fmt.Errorf("admin用户禁止通过接口移除")
128+
}
129+
126130
err = ts.ctx.DB.Tenant().RemoveTenantLinkedUsers(r.ID, r.UserID)
127131
if err != nil {
128132
return nil, err
@@ -141,6 +145,10 @@ func (ts tenantService) GetUsersForTenant(req interface{}) (data interface{}, er
141145

142146
func (ts tenantService) ChangeTenantUserRole(req interface{}) (data interface{}, err interface{}) {
143147
r := req.(*types.RequestTenantChangeUserRole)
148+
if r.UserID == "admin" {
149+
return nil, fmt.Errorf("admin用户角色禁止通过接口修改")
150+
}
151+
144152
err = ts.ctx.DB.Tenant().ChangeTenantUserRole(r.ID, r.UserID, r.UserRole)
145153
if err != nil {
146154
return nil, err

internal/services/user.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ type userService struct {
2020

2121
type InterUserService interface {
2222
List(req interface{}) (interface{}, interface{})
23-
Get(req interface{}) (interface{}, interface{})
23+
Check(req interface{}) (interface{}, interface{})
24+
Info(req interface{}) (interface{}, interface{})
2425
Login(req interface{}) (interface{}, interface{})
2526
Update(req interface{}) (interface{}, interface{})
2627
Register(req interface{}) (interface{}, interface{})
@@ -45,7 +46,18 @@ func (us userService) List(req interface{}) (interface{}, interface{}) {
4546
return data, nil
4647
}
4748

48-
func (us userService) Get(req interface{}) (interface{}, interface{}) {
49+
func (us userService) Check(req interface{}) (interface{}, interface{}) {
50+
r := req.(*types.RequestUserQuery)
51+
52+
_, _, err := us.ctx.DB.User().Get(r.UserId, r.UserName, r.Query)
53+
if err != nil {
54+
return "false", err
55+
}
56+
57+
return "ok", nil
58+
}
59+
60+
func (us userService) Info(req interface{}) (interface{}, interface{}) {
4961
r := req.(*types.RequestUserQuery)
5062

5163
data, _, err := us.ctx.DB.User().Get(r.UserId, r.UserName, r.Query)
@@ -191,6 +203,9 @@ func (us userService) Delete(req interface{}) (interface{}, interface{}) {
191203

192204
func (us userService) ChangePass(req interface{}) (interface{}, interface{}) {
193205
r := req.(*types.RequestUserChangePassword)
206+
if r.UserId == "admin" {
207+
return nil, fmt.Errorf("admin用户密码禁止通过接口修改")
208+
}
194209

195210
arr := md5.Sum([]byte(r.Password))
196211
hashPassword := hex.EncodeToString(arr[:])

pkg/provider/kubernetes.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@ package provider
22

33
import (
44
"context"
5-
"crypto/md5"
6-
"encoding/hex"
75
"fmt"
86
"os"
7+
"strconv"
98
"strings"
109
"time"
1110
"watchAlert/pkg/tools"
@@ -61,7 +60,7 @@ func NewKubernetesClient(ctx context.Context, kubeConfigContent string, labels m
6160
}
6261

6362
func (a KubernetesClient) GetWarningEvent(reason string, scope int, filter []string) (map[string][]KubernetesEventItem, error) {
64-
var warningEvents corev1.EventList
63+
var warningEvents = corev1.EventList{}
6564
cutoffTime := time.Now().Add(-time.Duration(scope) * time.Minute)
6665
opts := metav1.ListOptions{
6766
Limit: 50, // 减少每次请求的数量,防止过多资源占用
@@ -76,7 +75,7 @@ func (a KubernetesClient) GetWarningEvent(reason string, scope int, filter []str
7675

7776
for _, event := range list.Items {
7877
// 检查事件的 Reason 和事件发生时间
79-
eventTime := event.EventTime
78+
eventTime := event.LastTimestamp
8079
if event.Reason == reason && eventTime.After(cutoffTime) {
8180
warningEvents.Items = append(warningEvents.Items, event)
8281
}
@@ -131,16 +130,21 @@ type KubernetesEvent struct {
131130
type KubernetesEventItem corev1.Event
132131

133132
func (a KubernetesEventItem) GetFingerprint() string {
134-
h := md5.New()
135-
s := map[string]interface{}{
133+
labels := map[string]interface{}{
136134
"namespace": a.Namespace,
137135
"resource": a.Reason,
138136
"podName": a.InvolvedObject.Name,
139137
}
140138

141-
h.Write(tools.JsonMarshalToByte(s))
142-
fingerprint := hex.EncodeToString(h.Sum(nil))
143-
return fingerprint
139+
var result uint64
140+
for labelName, labelValue := range labels {
141+
sum := tools.HashNew()
142+
sum = tools.HashAdd(sum, labelName)
143+
sum = tools.HashAdd(sum, fmt.Sprintf("%v", labelValue))
144+
result ^= sum
145+
}
146+
147+
return strconv.FormatUint(result, 10)
144148
}
145149

146150
func (a KubernetesEventItem) GetMetrics() map[string]interface{} {

pkg/provider/logs.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
package provider
22

33
import (
4-
"crypto/md5"
5-
"encoding/hex"
64
"fmt"
5+
"strconv"
76
"strings"
87
"time"
98
"watchAlert/internal/models"
@@ -90,13 +89,19 @@ type Logs struct {
9089
}
9190

9291
func (l Logs) GenerateFingerprint(ruleId string) string {
93-
h := md5.New()
94-
streamString := tools.JsonMarshalToString(map[string]string{
92+
labels := map[string]string{
9593
"ruleId": ruleId,
96-
})
97-
h.Write([]byte(streamString))
98-
fingerprint := hex.EncodeToString(h.Sum(nil))
99-
return fingerprint
94+
}
95+
96+
var result uint64
97+
for labelName, labelValue := range labels {
98+
sum := tools.HashNew()
99+
sum = tools.HashAdd(sum, labelName)
100+
sum = tools.HashAdd(sum, fmt.Sprintf("%v", labelValue))
101+
result ^= sum
102+
}
103+
104+
return strconv.FormatUint(result, 10)
100105
}
101106

102107
func (l Logs) GetAnnotations() map[string]interface{} {

0 commit comments

Comments
 (0)