Skip to content

Commit 2851b0d

Browse files
committed
feat(inhibit): add inhibition metrics
Add metrics for inhibition rules: - alertmanager_inhibitor_source_alerts_cache_size - alertmanager_inhibitor_source_alerts_index_size - alertmanager_inhibitor_rule_matches_duration_seconds - alertmanager_inhibitor_rule_mutes_duration_seconds Named rules get dedicated metric dimensions. All rules are counted against global metrics (rule="all"). If no rule mutes an alert the rule="none" dimension is used. Other changes: - Add Len() method to store.Alerts struct - Add Len() method to inhibit.index struct Signed-off-by: Siavash Safi <[email protected]>
1 parent 1f2df03 commit 2851b0d

File tree

8 files changed

+666
-22
lines changed

8 files changed

+666
-22
lines changed

cmd/alertmanager/main.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ var (
104104
prometheus.GaugeOpts{
105105
Name: "alertmanager_inhibition_rules",
106106
Help: "Number of configured inhibition rules.",
107-
})
107+
},
108+
)
109+
108110
promslogConfig = promslog.Config{}
109111
)
110112

@@ -408,6 +410,7 @@ func run() int {
408410
)
409411

410412
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
413+
inhibitMetrics := inhibit.NewInhibitorMetrics(prometheus.DefaultRegisterer)
411414
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
412415
configLogger := logger.With("component", "configuration")
413416
configCoordinator := config.NewCoordinator(
@@ -462,7 +465,7 @@ func run() int {
462465
inhibitor.Stop()
463466
disp.Stop()
464467

465-
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
468+
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger, inhibitMetrics)
466469
silencer := silence.NewSilencer(silences, marker, logger)
467470

468471
// An interface value that holds a nil concrete value is non-nil.

inhibit/index.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,10 @@ func (c *index) Delete(key model.Fingerprint) {
5555

5656
delete(c.items, key)
5757
}
58+
59+
func (c *index) Len() int {
60+
c.mtx.RLock()
61+
defer c.mtx.RUnlock()
62+
63+
return len(c.items)
64+
}

inhibit/inhibit.go

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,24 +33,27 @@ import (
3333
// currently active alerts and a set of inhibition rules. It implements the
3434
// Muter interface.
3535
type Inhibitor struct {
36-
alerts provider.Alerts
37-
rules []*InhibitRule
38-
marker types.AlertMarker
39-
logger *slog.Logger
36+
alerts provider.Alerts
37+
rules []*InhibitRule
38+
marker types.AlertMarker
39+
logger *slog.Logger
40+
metrics *InhibitorMetrics
4041

4142
mtx sync.RWMutex
4243
cancel func()
4344
}
4445

4546
// NewInhibitor returns a new Inhibitor.
46-
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
47+
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger, metrics *InhibitorMetrics) *Inhibitor {
4748
ih := &Inhibitor{
48-
alerts: ap,
49-
marker: mk,
50-
logger: logger,
49+
alerts: ap,
50+
marker: mk,
51+
logger: logger,
52+
metrics: metrics,
5153
}
54+
5255
for _, cr := range rs {
53-
r := NewInhibitRule(cr)
56+
r := NewInhibitRule(cr, NewRuleMetrics(cr.Name, metrics))
5457
ih.rules = append(ih.rules, r)
5558
}
5659
return ih
@@ -70,16 +73,22 @@ func (ih *Inhibitor) run(ctx context.Context) {
7073
continue
7174
}
7275
// Update the inhibition rules' cache.
76+
cached := 0
77+
indexed := 0
7378
for _, r := range ih.rules {
7479
if r.SourceMatchers.Matches(a.Labels) {
7580
if err := r.scache.Set(a); err != nil {
7681
ih.logger.Error("error on set alert", "err", err)
7782
continue
7883
}
79-
8084
r.updateIndex(a)
85+
cached += r.scache.Len()
86+
indexed += r.sindex.Len()
87+
r.metrics.ObserveSourceAlertsCacheSize(cached)
8188
}
8289
}
90+
ih.metrics.ObserveSourceAlertsIndexSize(indexed)
91+
ih.metrics.ObserveSourceAlertsCacheSize(cached)
8392
}
8493
}
8594
}
@@ -128,21 +137,28 @@ func (ih *Inhibitor) Stop() {
128137
// Mutes returns true iff the given label set is muted. It implements the Muter
129138
// interface.
130139
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
140+
start := time.Now()
131141
fp := lset.Fingerprint()
132142

133143
for _, r := range ih.rules {
144+
ruleStart := time.Now()
134145
if !r.TargetMatchers.Matches(lset) {
135146
// If target side of rule doesn't match, we don't need to look any further.
147+
r.metrics.ObserveRuleMatchesDuration("false", time.Since(ruleStart))
136148
continue
137149
}
150+
r.metrics.ObserveRuleMatchesDuration("true", time.Since(ruleStart))
138151
// If we are here, the target side matches. If the source side matches, too, we
139152
// need to exclude inhibiting alerts for which the same is true.
140153
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset)); eq {
141154
ih.marker.SetInhibited(fp, inhibitedByFP.String())
155+
r.metrics.ObserveRuleMutesDuration("true", time.Since(ruleStart))
142156
return true
143157
}
158+
r.metrics.ObserveRuleMutesDuration("false", time.Since(ruleStart))
144159
}
145160
ih.marker.SetInhibited(fp)
161+
ih.metrics.ObserveRuleMutesDuration("none", "false", time.Since(start))
146162

147163
return false
148164
}
@@ -173,14 +189,17 @@ type InhibitRule struct {
173189
// The index items might overwrite each other if multiple source alerts have exactly equal labels.
174190
// Overwrites only happen if the new source alert has bigger EndsAt value.
175191
sindex *index
192+
193+
metrics *RuleMetrics
176194
}
177195

178196
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
179-
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
197+
func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
180198
var (
181199
sourcem labels.Matchers
182200
targetm labels.Matchers
183201
)
202+
184203
// cr.SourceMatch will be deprecated. This for loop appends equality matchers.
185204
for ln, lv := range cr.SourceMatch {
186205
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
@@ -235,6 +254,7 @@ func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
235254
Equal: equal,
236255
scache: store.NewAlerts(),
237256
sindex: newIndex(),
257+
metrics: metrics,
238258
}
239259

240260
rule.scache.SetGCCallback(rule.gcCallback)
@@ -262,6 +282,7 @@ func (r *InhibitRule) updateIndex(alert *types.Alert) {
262282
if !ok {
263283
// If not, add it.
264284
r.sindex.Set(eq, fp)
285+
r.metrics.ObserveSourceAlertsIndexSize(r.sindex.Len())
265286
return
266287
}
267288
// If the indexed fingerprint is the same as the new fingerprint, do nothing.
@@ -274,12 +295,14 @@ func (r *InhibitRule) updateIndex(alert *types.Alert) {
274295
if err != nil {
275296
// failed to get the existing alert, overwrite the index.
276297
r.sindex.Set(eq, fp)
298+
r.metrics.ObserveSourceAlertsIndexSize(r.sindex.Len())
277299
return
278300
}
279301

280302
// If the new alert resolves after the existing alert, replace the index.
281303
if existing.ResolvedAt(alert.EndsAt) {
282304
r.sindex.Set(eq, fp)
305+
r.metrics.ObserveSourceAlertsIndexSize(r.sindex.Len())
283306
return
284307
}
285308
// If the existing alert resolves after the new alert, do nothing.
@@ -310,6 +333,8 @@ func (r *InhibitRule) gcCallback(alerts []types.Alert) {
310333
fp := r.fingerprintEquals(a.Labels)
311334
r.sindex.Delete(fp)
312335
}
336+
r.metrics.ObserveSourceAlertsCacheSize(r.scache.Len())
337+
r.metrics.ObserveSourceAlertsIndexSize(r.sindex.Len())
313338
}
314339

315340
// hasEqual checks whether the source cache contains alerts matching the equal

inhibit/inhibit_bench_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
198198
}
199199
}
200200

201-
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger())
201+
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger(), NewInhibitorMetrics(r))
202202
defer ih.Stop()
203203
go ih.Run()
204204

inhibit/inhibit_test.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ func TestInhibitRuleHasEqual(t *testing.T) {
125125
for _, c := range cases {
126126
t.Run(c.name, func(t *testing.T) {
127127
r := &InhibitRule{
128-
Equal: map[model.LabelName]struct{}{},
129-
scache: store.NewAlerts(),
130-
sindex: newIndex(),
128+
Equal: map[model.LabelName]struct{}{},
129+
scache: store.NewAlerts(),
130+
sindex: newIndex(),
131+
metrics: NewRuleMetrics("test", NewInhibitorMetrics(prometheus.NewRegistry())),
131132
}
132133
for _, ln := range c.equal {
133134
r.Equal[ln] = struct{}{}
@@ -159,7 +160,7 @@ func TestInhibitRuleMatches(t *testing.T) {
159160
}
160161

161162
m := types.NewMarker(prometheus.NewRegistry())
162-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
163+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
163164
now := time.Now()
164165
// Active alert that matches the source filter of rule1.
165166
sourceAlert1 := &types.Alert{
@@ -260,7 +261,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
260261
}
261262

262263
m := types.NewMarker(prometheus.NewRegistry())
263-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
264+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
264265
now := time.Now()
265266
// Active alert that matches the source filter of rule1.
266267
sourceAlert1 := &types.Alert{
@@ -369,8 +370,8 @@ func TestInhibitRuleName(t *testing.T) {
369370
Equal: []string{"instance"},
370371
}
371372

372-
rule1 := NewInhibitRule(config1)
373-
rule2 := NewInhibitRule(config2)
373+
rule1 := NewInhibitRule(config1, nil)
374+
rule2 := NewInhibitRule(config2, nil)
374375

375376
require.Equal(t, "test-rule", rule1.Name, "Expected named rule to have adopt name from config")
376377
require.Empty(t, rule2.Name, "Expected unnamed rule to have empty name")
@@ -498,7 +499,7 @@ func TestInhibit(t *testing.T) {
498499
} {
499500
ap := newFakeAlerts(tc.alerts)
500501
mk := types.NewMarker(prometheus.NewRegistry())
501-
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger)
502+
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
502503

503504
go func() {
504505
for ap.finished != nil {

inhibit/metric.go

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package inhibit
15+
16+
import (
17+
"time"
18+
19+
"github.com/prometheus/client_golang/prometheus"
20+
)
21+
22+
// InhibitorMetrics represents metrics associated to an inhibitor.
23+
type InhibitorMetrics struct {
24+
sourceAlertsCacheSize *prometheus.GaugeVec
25+
sourceAlertsIndexSize *prometheus.GaugeVec
26+
ruleMatchesDuration *prometheus.SummaryVec
27+
ruleMuteDuration *prometheus.SummaryVec
28+
}
29+
30+
// NewInhibitorMetrics returns a new InhibitorMetrics.
31+
func NewInhibitorMetrics(reg prometheus.Registerer) *InhibitorMetrics {
32+
metrics := &InhibitorMetrics{
33+
sourceAlertsCacheSize: prometheus.NewGaugeVec(
34+
prometheus.GaugeOpts{
35+
Name: "alertmanager_inhibitor_source_alerts_cache_size",
36+
Help: "Size of the source alerts cache in inhibition rules.",
37+
},
38+
[]string{"rule"},
39+
),
40+
sourceAlertsIndexSize: prometheus.NewGaugeVec(
41+
prometheus.GaugeOpts{
42+
Name: "alertmanager_inhibitor_source_alerts_index_size",
43+
Help: "Size of the source alerts index in inhibition rules.",
44+
},
45+
[]string{"rule"},
46+
),
47+
ruleMatchesDuration: prometheus.NewSummaryVec(
48+
prometheus.SummaryOpts{
49+
Name: "alertmanager_inhibitor_rule_matches_duration_seconds",
50+
Help: "Summary of latencies for the matching of alerts by inhibition rules.",
51+
},
52+
[]string{"rule", "matched"},
53+
),
54+
ruleMuteDuration: prometheus.NewSummaryVec(
55+
prometheus.SummaryOpts{
56+
Name: "alertmanager_inhibitor_rule_mutes_duration_seconds",
57+
Help: "Summary of latencies for the muting of alerts by inhibition rules.",
58+
},
59+
[]string{"rule", "muted"},
60+
),
61+
}
62+
if reg != nil {
63+
reg.MustRegister(
64+
metrics.sourceAlertsCacheSize,
65+
metrics.sourceAlertsIndexSize,
66+
metrics.ruleMatchesDuration,
67+
metrics.ruleMuteDuration,
68+
)
69+
}
70+
71+
metrics.ObserveSourceAlertsCacheSize(0)
72+
metrics.ObserveSourceAlertsIndexSize(0)
73+
74+
return metrics
75+
}
76+
77+
func (m *InhibitorMetrics) ObserveSourceAlertsCacheSize(size int) {
78+
m.sourceAlertsCacheSize.With(prometheus.Labels{"rule": "all"}).Set(float64(size))
79+
}
80+
81+
func (m *InhibitorMetrics) ObserveSourceAlertsIndexSize(size int) {
82+
m.sourceAlertsIndexSize.With(prometheus.Labels{"rule": "all"}).Set(float64(size))
83+
}
84+
85+
func (m *InhibitorMetrics) ObserveRuleMutesDuration(rule, muted string, duration time.Duration) {
86+
m.ruleMuteDuration.With(prometheus.Labels{"rule": rule, "muted": muted}).Observe(duration.Seconds())
87+
}
88+
89+
type RuleMetrics struct {
90+
ruleName string
91+
matchesDuration *prometheus.SummaryVec
92+
mutesDuration *prometheus.SummaryVec
93+
sourceAlertsCacheSize *prometheus.GaugeVec
94+
sourceAlertsIndexSize *prometheus.GaugeVec
95+
}
96+
97+
func NewRuleMetrics(name string, metrics *InhibitorMetrics) *RuleMetrics {
98+
rm := &RuleMetrics{
99+
ruleName: name,
100+
matchesDuration: metrics.ruleMatchesDuration,
101+
mutesDuration: metrics.ruleMuteDuration,
102+
sourceAlertsCacheSize: metrics.sourceAlertsCacheSize,
103+
sourceAlertsIndexSize: metrics.sourceAlertsIndexSize,
104+
}
105+
rm.ObserveSourceAlertsCacheSize(0)
106+
rm.ObserveSourceAlertsIndexSize(0)
107+
return rm
108+
}
109+
110+
func (rm *RuleMetrics) ObserveSourceAlertsCacheSize(size int) {
111+
// If rule name is empty, skip per rule gauge metrics
112+
if rm.ruleName == "" {
113+
return
114+
}
115+
rm.sourceAlertsCacheSize.With(prometheus.Labels{"rule": rm.ruleName}).Set(float64(size))
116+
}
117+
118+
func (rm *RuleMetrics) ObserveSourceAlertsIndexSize(size int) {
119+
// If rule name is empty, skip per rule gauge metrics
120+
if rm.ruleName == "" {
121+
return
122+
}
123+
rm.sourceAlertsIndexSize.With(prometheus.Labels{"rule": rm.ruleName}).Set(float64(size))
124+
}
125+
126+
func (rm *RuleMetrics) ObserveRuleMatchesDuration(matched string, duration time.Duration) {
127+
rm.matchesDuration.With(prometheus.Labels{"rule": rm.ruleName, "matched": matched}).Observe(duration.Seconds())
128+
}
129+
130+
func (rm *RuleMetrics) ObserveRuleMutesDuration(muted string, duration time.Duration) {
131+
rm.mutesDuration.With(prometheus.Labels{"rule": rm.ruleName, "muted": muted}).Observe(duration.Seconds())
132+
}

0 commit comments

Comments
 (0)