Skip to content

Commit 7d259e1

Browse files
committed
feat(inhibit): add inhibition metrics
Add metrics for inhibitor:
- alertmanager_inhibitor_source_alerts_cache_size
- alertmanager_inhibitor_source_alerts_index_size
- alertmanager_inhibitor_mutes_duration_seconds

Add metrics for inhibition rules:
- alertmanager_inhibit_rule_source_alerts_cache_size
- alertmanager_inhibit_rule_source_alerts_index_size
- alertmanager_inhibit_rule_matches_duration_seconds
- alertmanager_inhibit_rule_mutes_duration_seconds

Other changes:
- Add warning for duplicate inhibition rule names
- Add Len() method to store.Alerts struct
- Add Len() method to inhibit.index struct

Signed-off-by: Siavash Safi <[email protected]>
1 parent 1f2df03 commit 7d259e1

File tree

9 files changed

+711
-24
lines changed

9 files changed

+711
-24
lines changed

cmd/alertmanager/main.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ var (
104104
prometheus.GaugeOpts{
105105
Name: "alertmanager_inhibition_rules",
106106
Help: "Number of configured inhibition rules.",
107-
})
107+
},
108+
)
109+
108110
promslogConfig = promslog.Config{}
109111
)
110112

@@ -408,6 +410,7 @@ func run() int {
408410
)
409411

410412
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
413+
inhibitMetrics := inhibit.NewInhibitorMetrics(prometheus.DefaultRegisterer)
411414
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
412415
configLogger := logger.With("component", "configuration")
413416
configCoordinator := config.NewCoordinator(
@@ -462,7 +465,7 @@ func run() int {
462465
inhibitor.Stop()
463466
disp.Stop()
464467

465-
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
468+
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger, inhibitMetrics)
466469
silencer := silence.NewSilencer(silences, marker, logger)
467470

468471
// An interface value that holds a nil concrete value is non-nil.

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ require (
3535
github.com/oklog/run v1.2.0
3636
github.com/oklog/ulid v1.3.1
3737
github.com/prometheus/client_golang v1.23.2
38+
github.com/prometheus/client_model v0.6.2
3839
github.com/prometheus/common v0.67.1
3940
github.com/prometheus/exporter-toolkit v0.14.1
4041
github.com/prometheus/sigv4 v0.2.1
@@ -104,7 +105,6 @@ require (
104105
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
105106
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
106107
github.com/pmezard/go-difflib v1.0.0 // indirect
107-
github.com/prometheus/client_model v0.6.2 // indirect
108108
github.com/prometheus/procfs v0.16.1 // indirect
109109
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
110110
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect

inhibit/index.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,10 @@ func (c *index) Delete(key model.Fingerprint) {
5555

5656
delete(c.items, key)
5757
}
58+
59+
func (c *index) Len() int {
60+
c.mtx.RLock()
61+
defer c.mtx.RUnlock()
62+
63+
return len(c.items)
64+
}

inhibit/inhibit.go

Lines changed: 55 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"time"
2121

2222
"github.com/oklog/run"
23+
"github.com/prometheus/client_golang/prometheus"
2324
"github.com/prometheus/common/model"
2425

2526
"github.com/prometheus/alertmanager/config"
@@ -33,25 +34,37 @@ import (
3334
// currently active alerts and a set of inhibition rules. It implements the
3435
// Muter interface.
3536
type Inhibitor struct {
36-
alerts provider.Alerts
37-
rules []*InhibitRule
38-
marker types.AlertMarker
39-
logger *slog.Logger
37+
alerts provider.Alerts
38+
rules []*InhibitRule
39+
marker types.AlertMarker
40+
logger *slog.Logger
41+
metrics *InhibitorMetrics
4042

4143
mtx sync.RWMutex
4244
cancel func()
4345
}
4446

4547
// NewInhibitor returns a new Inhibitor.
46-
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
48+
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger, metrics *InhibitorMetrics) *Inhibitor {
4749
ih := &Inhibitor{
48-
alerts: ap,
49-
marker: mk,
50-
logger: logger,
50+
alerts: ap,
51+
marker: mk,
52+
logger: logger,
53+
metrics: metrics,
5154
}
52-
for _, cr := range rs {
53-
r := NewInhibitRule(cr)
55+
56+
ruleNames := make(map[string]struct{})
57+
for i, cr := range rs {
58+
if _, ok := ruleNames[cr.Name]; ok {
59+
ih.logger.Warn("duplicate inhibition rule name", "index", i, "name", cr.Name)
60+
}
61+
62+
r := NewInhibitRule(cr, NewRuleMetrics(cr.Name, metrics))
5463
ih.rules = append(ih.rules, r)
64+
65+
if cr.Name != "" {
66+
ruleNames[cr.Name] = struct{}{}
67+
}
5568
}
5669
return ih
5770
}
@@ -70,16 +83,30 @@ func (ih *Inhibitor) run(ctx context.Context) {
7083
continue
7184
}
7285
// Update the inhibition rules' cache.
86+
cachedSum := 0
87+
indexedSum := 0
7388
for _, r := range ih.rules {
7489
if r.SourceMatchers.Matches(a.Labels) {
7590
if err := r.scache.Set(a); err != nil {
7691
ih.logger.Error("error on set alert", "err", err)
7792
continue
7893
}
79-
8094
r.updateIndex(a)
95+
96+
cached := r.scache.Len()
97+
indexed := r.sindex.Len()
98+
99+
if r.Name != "" {
100+
r.metrics.sourceAlertsCacheSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(cached))
101+
r.metrics.sourceAlertsIndexSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(indexed))
102+
}
103+
104+
cachedSum += cached
105+
indexedSum += indexed
81106
}
82107
}
108+
ih.metrics.sourceAlertsCacheSize.Set(float64(cachedSum))
109+
ih.metrics.sourceAlertsIndexSize.Set(float64(indexedSum))
83110
}
84111
}
85112
}
@@ -128,21 +155,29 @@ func (ih *Inhibitor) Stop() {
128155
// Mutes returns true iff the given label set is muted. It implements the Muter
129156
// interface.
130157
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
158+
start := time.Now()
131159
fp := lset.Fingerprint()
132160

133161
for _, r := range ih.rules {
162+
ruleStart := time.Now()
134163
if !r.TargetMatchers.Matches(lset) {
135164
// If target side of rule doesn't match, we don't need to look any further.
165+
r.metrics.matchesDuration.With(prometheus.Labels{"rule": r.Name, "matched": "false"}).Observe(time.Since(ruleStart).Seconds())
136166
continue
137167
}
168+
r.metrics.matchesDuration.With(prometheus.Labels{"rule": r.Name, "matched": "true"}).Observe(time.Since(ruleStart).Seconds())
138169
// If we are here, the target side matches. If the source side matches, too, we
139170
// need to exclude inhibiting alerts for which the same is true.
140171
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset)); eq {
141172
ih.marker.SetInhibited(fp, inhibitedByFP.String())
173+
ih.metrics.mutesDuration.With(prometheus.Labels{"muted": "true"}).Observe(time.Since(start).Seconds())
174+
r.metrics.mutesDuration.With(prometheus.Labels{"rule": r.Name, "muted": "true"}).Observe(time.Since(ruleStart).Seconds())
142175
return true
143176
}
177+
r.metrics.mutesDuration.With(prometheus.Labels{"rule": r.Name, "muted": "false"}).Observe(time.Since(ruleStart).Seconds())
144178
}
145179
ih.marker.SetInhibited(fp)
180+
ih.metrics.mutesDuration.With(prometheus.Labels{"muted": "false"}).Observe(time.Since(start).Seconds())
146181

147182
return false
148183
}
@@ -173,14 +208,17 @@ type InhibitRule struct {
173208
// The index items might overwrite each other if multiple source alerts have exactly equal labels.
174209
// Overwrites only happen if the new source alert has a bigger EndsAt value.
175210
sindex *index
211+
212+
metrics *RuleMetrics
176213
}
177214

178215
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
179-
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
216+
func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
180217
var (
181218
sourcem labels.Matchers
182219
targetm labels.Matchers
183220
)
221+
184222
// cr.SourceMatch will be deprecated. This for loop appends equality matchers.
185223
for ln, lv := range cr.SourceMatch {
186224
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
@@ -235,6 +273,7 @@ func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
235273
Equal: equal,
236274
scache: store.NewAlerts(),
237275
sindex: newIndex(),
276+
metrics: metrics,
238277
}
239278

240279
rule.scache.SetGCCallback(rule.gcCallback)
@@ -310,6 +349,10 @@ func (r *InhibitRule) gcCallback(alerts []types.Alert) {
310349
fp := r.fingerprintEquals(a.Labels)
311350
r.sindex.Delete(fp)
312351
}
352+
if r.Name != "" {
353+
r.metrics.sourceAlertsCacheSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.scache.Len()))
354+
r.metrics.sourceAlertsIndexSize.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.sindex.Len()))
355+
}
313356
}
314357

315358
// hasEqual checks whether the source cache contains alerts matching the equal

inhibit/inhibit_bench_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
198198
}
199199
}
200200

201-
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger())
201+
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger(), NewInhibitorMetrics(r))
202202
defer ih.Stop()
203203
go ih.Run()
204204

inhibit/inhibit_test.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ func TestInhibitRuleHasEqual(t *testing.T) {
125125
for _, c := range cases {
126126
t.Run(c.name, func(t *testing.T) {
127127
r := &InhibitRule{
128-
Equal: map[model.LabelName]struct{}{},
129-
scache: store.NewAlerts(),
130-
sindex: newIndex(),
128+
Equal: map[model.LabelName]struct{}{},
129+
scache: store.NewAlerts(),
130+
sindex: newIndex(),
131+
metrics: NewRuleMetrics("test", NewInhibitorMetrics(prometheus.NewRegistry())),
131132
}
132133
for _, ln := range c.equal {
133134
r.Equal[ln] = struct{}{}
@@ -159,7 +160,7 @@ func TestInhibitRuleMatches(t *testing.T) {
159160
}
160161

161162
m := types.NewMarker(prometheus.NewRegistry())
162-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
163+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
163164
now := time.Now()
164165
// Active alert that matches the source filter of rule1.
165166
sourceAlert1 := &types.Alert{
@@ -260,7 +261,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
260261
}
261262

262263
m := types.NewMarker(prometheus.NewRegistry())
263-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
264+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
264265
now := time.Now()
265266
// Active alert that matches the source filter of rule1.
266267
sourceAlert1 := &types.Alert{
@@ -369,8 +370,8 @@ func TestInhibitRuleName(t *testing.T) {
369370
Equal: []string{"instance"},
370371
}
371372

372-
rule1 := NewInhibitRule(config1)
373-
rule2 := NewInhibitRule(config2)
373+
rule1 := NewInhibitRule(config1, nil)
374+
rule2 := NewInhibitRule(config2, nil)
374375

375376
require.Equal(t, "test-rule", rule1.Name, "Expected named rule to have adopt name from config")
376377
require.Empty(t, rule2.Name, "Expected unnamed rule to have empty name")
@@ -498,7 +499,7 @@ func TestInhibit(t *testing.T) {
498499
} {
499500
ap := newFakeAlerts(tc.alerts)
500501
mk := types.NewMarker(prometheus.NewRegistry())
501-
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger)
502+
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
502503

503504
go func() {
504505
for ap.finished != nil {

inhibit/metric.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package inhibit
15+
16+
import (
17+
"github.com/prometheus/client_golang/prometheus"
18+
)
19+
20+
// InhibitorMetrics represents metrics associated to an inhibitor.
21+
type InhibitorMetrics struct {
22+
// Inhibitor metrics
23+
sourceAlertsCacheSize prometheus.Gauge
24+
sourceAlertsIndexSize prometheus.Gauge
25+
mutesDuration *prometheus.SummaryVec
26+
27+
// Rule metrics
28+
ruleSourceAlertsCacheSize *prometheus.GaugeVec
29+
ruleSourceAlertsIndexSize *prometheus.GaugeVec
30+
ruleMatchesDuration *prometheus.SummaryVec
31+
ruleMutesDuration *prometheus.SummaryVec
32+
}
33+
34+
// NewInhibitorMetrics returns a new InhibitorMetrics.
35+
func NewInhibitorMetrics(reg prometheus.Registerer) *InhibitorMetrics {
36+
metrics := &InhibitorMetrics{
37+
sourceAlertsCacheSize: prometheus.NewGauge(
38+
prometheus.GaugeOpts{
39+
Name: "alertmanager_inhibitor_source_alerts_cache_size",
40+
Help: "Size of the source alerts cache in inhibition rules.",
41+
},
42+
),
43+
sourceAlertsIndexSize: prometheus.NewGauge(
44+
prometheus.GaugeOpts{
45+
Name: "alertmanager_inhibitor_source_alerts_index_size",
46+
Help: "Size of the source alerts index in inhibition rules.",
47+
},
48+
),
49+
mutesDuration: prometheus.NewSummaryVec(
50+
prometheus.SummaryOpts{
51+
Name: "alertmanager_inhibitor_mutes_duration_seconds",
52+
Help: "Summary of latencies for the muting of alerts by inhibition rules.",
53+
},
54+
[]string{"muted"},
55+
),
56+
ruleSourceAlertsCacheSize: prometheus.NewGaugeVec(
57+
prometheus.GaugeOpts{
58+
Name: "alertmanager_inhibit_rule_source_alerts_cache_size",
59+
Help: "Size of the source alerts cache in inhibition rules.",
60+
},
61+
[]string{"rule"},
62+
),
63+
ruleSourceAlertsIndexSize: prometheus.NewGaugeVec(
64+
prometheus.GaugeOpts{
65+
Name: "alertmanager_inhibit_rule_source_alerts_index_size",
66+
Help: "Size of the source alerts index in inhibition rules.",
67+
},
68+
[]string{"rule"},
69+
),
70+
ruleMatchesDuration: prometheus.NewSummaryVec(
71+
prometheus.SummaryOpts{
72+
Name: "alertmanager_inhibit_rule_matches_duration_seconds",
73+
Help: "Summary of latencies for the matching of alerts by inhibition rules.",
74+
},
75+
[]string{"rule", "matched"},
76+
),
77+
ruleMutesDuration: prometheus.NewSummaryVec(
78+
prometheus.SummaryOpts{
79+
Name: "alertmanager_inhibit_rule_mutes_duration_seconds",
80+
Help: "Summary of latencies for the muting of alerts by inhibition rules.",
81+
},
82+
[]string{"rule", "muted"},
83+
),
84+
}
85+
if reg != nil {
86+
reg.MustRegister(
87+
metrics.sourceAlertsCacheSize,
88+
metrics.sourceAlertsIndexSize,
89+
metrics.mutesDuration,
90+
metrics.ruleSourceAlertsCacheSize,
91+
metrics.ruleSourceAlertsIndexSize,
92+
metrics.ruleMatchesDuration,
93+
metrics.ruleMutesDuration,
94+
)
95+
}
96+
97+
metrics.sourceAlertsCacheSize.Set(0)
98+
metrics.sourceAlertsIndexSize.Set(0)
99+
100+
return metrics
101+
}
102+
103+
type RuleMetrics struct {
104+
ruleName string
105+
matchesDuration *prometheus.SummaryVec
106+
mutesDuration *prometheus.SummaryVec
107+
sourceAlertsCacheSize *prometheus.GaugeVec
108+
sourceAlertsIndexSize *prometheus.GaugeVec
109+
}
110+
111+
func NewRuleMetrics(name string, metrics *InhibitorMetrics) *RuleMetrics {
112+
rm := &RuleMetrics{
113+
ruleName: name,
114+
matchesDuration: metrics.ruleMatchesDuration,
115+
mutesDuration: metrics.ruleMutesDuration,
116+
sourceAlertsCacheSize: metrics.ruleSourceAlertsCacheSize,
117+
sourceAlertsIndexSize: metrics.ruleSourceAlertsIndexSize,
118+
}
119+
120+
rm.sourceAlertsCacheSize.With(prometheus.Labels{"rule": rm.ruleName}).Set(0)
121+
rm.sourceAlertsIndexSize.With(prometheus.Labels{"rule": rm.ruleName}).Set(0)
122+
123+
return rm
124+
}

0 commit comments

Comments
 (0)