Skip to content

Commit 7439ab2

Browse files
committed
feat(inhibit): add inhibition metrics
Add metrics for inhibitor:
- alertmanager_inhibitor_duplicate_rule_names
- alertmanager_inhibitor_source_alerts_cache_size
- alertmanager_inhibitor_source_alerts_index_size
- alertmanager_inhibitor_mutes_duration_seconds

Add metrics for inhibition rules:
- alertmanager_inhibit_rule_source_alerts_cache_size
- alertmanager_inhibit_rule_source_alerts_index_size
- alertmanager_inhibit_rule_matches_duration_seconds
- alertmanager_inhibit_rule_mutes_duration_seconds

Other changes:
- Add warning for duplicate inhibition rule names
- Add Len() method to store.Alerts struct
- Add Len() method to inhibit.index struct

Signed-off-by: Siavash Safi <[email protected]>
1 parent 1f2df03 commit 7439ab2

File tree

9 files changed

+815
-24
lines changed

9 files changed

+815
-24
lines changed

cmd/alertmanager/main.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ var (
104104
prometheus.GaugeOpts{
105105
Name: "alertmanager_inhibition_rules",
106106
Help: "Number of configured inhibition rules.",
107-
})
107+
},
108+
)
109+
108110
promslogConfig = promslog.Config{}
109111
)
110112

@@ -408,6 +410,7 @@ func run() int {
408410
)
409411

410412
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
413+
inhibitMetrics := inhibit.NewInhibitorMetrics(prometheus.DefaultRegisterer)
411414
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
412415
configLogger := logger.With("component", "configuration")
413416
configCoordinator := config.NewCoordinator(
@@ -462,7 +465,7 @@ func run() int {
462465
inhibitor.Stop()
463466
disp.Stop()
464467

465-
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
468+
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger, inhibitMetrics)
466469
silencer := silence.NewSilencer(silences, marker, logger)
467470

468471
// An interface value that holds a nil concrete value is non-nil.

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ require (
3535
github.com/oklog/run v1.2.0
3636
github.com/oklog/ulid v1.3.1
3737
github.com/prometheus/client_golang v1.23.2
38+
github.com/prometheus/client_model v0.6.2
3839
github.com/prometheus/common v0.67.1
3940
github.com/prometheus/exporter-toolkit v0.14.1
4041
github.com/prometheus/sigv4 v0.2.1
@@ -104,7 +105,6 @@ require (
104105
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
105106
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
106107
github.com/pmezard/go-difflib v1.0.0 // indirect
107-
github.com/prometheus/client_model v0.6.2 // indirect
108108
github.com/prometheus/procfs v0.16.1 // indirect
109109
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
110110
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect

inhibit/index.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,10 @@ func (c *index) Delete(key model.Fingerprint) {
5555

5656
delete(c.items, key)
5757
}
58+
59+
func (c *index) Len() int {
60+
c.mtx.RLock()
61+
defer c.mtx.RUnlock()
62+
63+
return len(c.items)
64+
}

inhibit/inhibit.go

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"time"
2121

2222
"github.com/oklog/run"
23+
"github.com/prometheus/client_golang/prometheus"
2324
"github.com/prometheus/common/model"
2425

2526
"github.com/prometheus/alertmanager/config"
@@ -33,26 +34,45 @@ import (
3334
// currently active alerts and a set of inhibition rules. It implements the
3435
// Muter interface.
3536
type Inhibitor struct {
36-
alerts provider.Alerts
37-
rules []*InhibitRule
38-
marker types.AlertMarker
39-
logger *slog.Logger
37+
alerts provider.Alerts
38+
rules []*InhibitRule
39+
marker types.AlertMarker
40+
logger *slog.Logger
41+
metrics *InhibitorMetrics
4042

4143
mtx sync.RWMutex
4244
cancel func()
4345
}
4446

4547
// NewInhibitor returns a new Inhibitor.
46-
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
48+
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger, metrics *InhibitorMetrics) *Inhibitor {
4749
ih := &Inhibitor{
48-
alerts: ap,
49-
marker: mk,
50-
logger: logger,
50+
alerts: ap,
51+
marker: mk,
52+
logger: logger,
53+
metrics: metrics,
5154
}
52-
for _, cr := range rs {
53-
r := NewInhibitRule(cr)
55+
56+
ruleNames := make(map[string]int)
57+
for i, cr := range rs {
58+
if _, ok := ruleNames[cr.Name]; ok {
59+
ih.logger.Warn("duplicate inhibition rule name", "index", i, "name", cr.Name)
60+
}
61+
62+
r := NewInhibitRule(cr, NewRuleMetrics(cr.Name, metrics))
5463
ih.rules = append(ih.rules, r)
64+
65+
if cr.Name != "" {
66+
ruleNames[cr.Name]++
67+
}
5568
}
69+
70+
for name, count := range ruleNames {
71+
if count > 1 {
72+
metrics.duplicateRuleNames.With(prometheus.Labels{"name": name}).Set(float64(count))
73+
}
74+
}
75+
5676
return ih
5777
}
5878

@@ -70,16 +90,30 @@ func (ih *Inhibitor) run(ctx context.Context) {
7090
continue
7191
}
7292
// Update the inhibition rules' cache.
93+
cachedSum := 0
94+
indexedSum := 0
7395
for _, r := range ih.rules {
7496
if r.SourceMatchers.Matches(a.Labels) {
7597
if err := r.scache.Set(a); err != nil {
7698
ih.logger.Error("error on set alert", "err", err)
7799
continue
78100
}
79-
80101
r.updateIndex(a)
102+
103+
cached := r.scache.Len()
104+
indexed := r.sindex.Len()
105+
106+
if r.Name != "" {
107+
r.metrics.sourceAlertsCacheSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(cached))
108+
r.metrics.sourceAlertsIndexSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(indexed))
109+
}
110+
111+
cachedSum += cached
112+
indexedSum += indexed
81113
}
82114
}
115+
ih.metrics.sourceAlertsCacheSize.Set(float64(cachedSum))
116+
ih.metrics.sourceAlertsIndexSize.Set(float64(indexedSum))
83117
}
84118
}
85119
}
@@ -128,21 +162,29 @@ func (ih *Inhibitor) Stop() {
128162
// Mutes returns true iff the given label set is muted. It implements the Muter
129163
// interface.
130164
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
165+
start := time.Now()
131166
fp := lset.Fingerprint()
132167

133168
for _, r := range ih.rules {
169+
ruleStart := time.Now()
134170
if !r.TargetMatchers.Matches(lset) {
135171
// If target side of rule doesn't match, we don't need to look any further.
172+
r.metrics.matchesDuration.With(prometheus.Labels{"rule": r.Name, "matched": "false"}).Observe(time.Since(ruleStart).Seconds())
136173
continue
137174
}
175+
r.metrics.matchesDuration.With(prometheus.Labels{"rule": r.Name, "matched": "true"}).Observe(time.Since(ruleStart).Seconds())
138176
// If we are here, the target side matches. If the source side matches, too, we
139177
// need to exclude inhibiting alerts for which the same is true.
140178
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset)); eq {
141179
ih.marker.SetInhibited(fp, inhibitedByFP.String())
180+
ih.metrics.mutesDuration.With(prometheus.Labels{"muted": "true"}).Observe(time.Since(start).Seconds())
181+
r.metrics.mutesDuration.With(prometheus.Labels{"rule": r.Name, "muted": "true"}).Observe(time.Since(ruleStart).Seconds())
142182
return true
143183
}
184+
r.metrics.mutesDuration.With(prometheus.Labels{"rule": r.Name, "muted": "false"}).Observe(time.Since(ruleStart).Seconds())
144185
}
145186
ih.marker.SetInhibited(fp)
187+
ih.metrics.mutesDuration.With(prometheus.Labels{"muted": "false"}).Observe(time.Since(start).Seconds())
146188

147189
return false
148190
}
@@ -173,14 +215,17 @@ type InhibitRule struct {
173215
// The index items might overwrite each other if multiple source alerts have exactly equal labels.
174216
// Overwrites only happen if the new source alert has bigger EndsAt value.
175217
sindex *index
218+
219+
metrics *RuleMetrics
176220
}
177221

178222
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
179-
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
223+
func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
180224
var (
181225
sourcem labels.Matchers
182226
targetm labels.Matchers
183227
)
228+
184229
// cr.SourceMatch will be deprecated. This for loop appends equality matchers.
185230
for ln, lv := range cr.SourceMatch {
186231
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
@@ -235,6 +280,7 @@ func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
235280
Equal: equal,
236281
scache: store.NewAlerts(),
237282
sindex: newIndex(),
283+
metrics: metrics,
238284
}
239285

240286
rule.scache.SetGCCallback(rule.gcCallback)
@@ -310,6 +356,10 @@ func (r *InhibitRule) gcCallback(alerts []types.Alert) {
310356
fp := r.fingerprintEquals(a.Labels)
311357
r.sindex.Delete(fp)
312358
}
359+
if r.Name != "" {
360+
r.metrics.sourceAlertsCacheSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.scache.Len()))
361+
r.metrics.sourceAlertsIndexSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.sindex.Len()))
362+
}
313363
}
314364

315365
// hasEqual checks whether the source cache contains alerts matching the equal

inhibit/inhibit_bench_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
198198
}
199199
}
200200

201-
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger())
201+
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger(), NewInhibitorMetrics(r))
202202
defer ih.Stop()
203203
go ih.Run()
204204

inhibit/inhibit_test.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ func TestInhibitRuleHasEqual(t *testing.T) {
125125
for _, c := range cases {
126126
t.Run(c.name, func(t *testing.T) {
127127
r := &InhibitRule{
128-
Equal: map[model.LabelName]struct{}{},
129-
scache: store.NewAlerts(),
130-
sindex: newIndex(),
128+
Equal: map[model.LabelName]struct{}{},
129+
scache: store.NewAlerts(),
130+
sindex: newIndex(),
131+
metrics: NewRuleMetrics("test", NewInhibitorMetrics(prometheus.NewRegistry())),
131132
}
132133
for _, ln := range c.equal {
133134
r.Equal[ln] = struct{}{}
@@ -159,7 +160,7 @@ func TestInhibitRuleMatches(t *testing.T) {
159160
}
160161

161162
m := types.NewMarker(prometheus.NewRegistry())
162-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
163+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
163164
now := time.Now()
164165
// Active alert that matches the source filter of rule1.
165166
sourceAlert1 := &types.Alert{
@@ -260,7 +261,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
260261
}
261262

262263
m := types.NewMarker(prometheus.NewRegistry())
263-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
264+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
264265
now := time.Now()
265266
// Active alert that matches the source filter of rule1.
266267
sourceAlert1 := &types.Alert{
@@ -369,8 +370,8 @@ func TestInhibitRuleName(t *testing.T) {
369370
Equal: []string{"instance"},
370371
}
371372

372-
rule1 := NewInhibitRule(config1)
373-
rule2 := NewInhibitRule(config2)
373+
rule1 := NewInhibitRule(config1, nil)
374+
rule2 := NewInhibitRule(config2, nil)
374375

375376
require.Equal(t, "test-rule", rule1.Name, "Expected named rule to have adopt name from config")
376377
require.Empty(t, rule2.Name, "Expected unnamed rule to have empty name")
@@ -498,7 +499,7 @@ func TestInhibit(t *testing.T) {
498499
} {
499500
ap := newFakeAlerts(tc.alerts)
500501
mk := types.NewMarker(prometheus.NewRegistry())
501-
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger)
502+
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
502503

503504
go func() {
504505
for ap.finished != nil {

0 commit comments

Comments
 (0)