Skip to content

Commit 710cd34

Browse files
emanlodovice authored and yeya24 committed
Remove APIEnableRulesBackup config and use Ring.ReplicationFactor ins… (#5901)
* Remove APIEnableRulesBackup config and use Ring.ReplicationFactor instead
* Update changelog
* Update change log
* Consider AZ awareness disabled if only 1 zone is present in list rules

Signed-off-by: Emmanuel Lodovice <[email protected]>
1 parent 059a936 commit 710cd34

File tree

9 files changed

+111
-76
lines changed

9 files changed

+111
-76
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Changelog
22

33
## master / unreleased
4+
* [CHANGE] Ruler: Remove `experimental.ruler.api-enable-rules-backup` flag and use `ruler.ring.replication-factor` to check if rules backup is enabled
45

56
## 1.17.0 in progress
67

docs/configuration/config-file-reference.md

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4271,15 +4271,6 @@ ring:
42714271
# CLI flag: -experimental.ruler.enable-api
42724272
[enable_api: <boolean> | default = false]
42734273
4274-
# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers
4275-
# with default state (state before any evaluation) and send this copy in list
4276-
# API requests as backup in case the ruler who owns the rule fails to send its
4277-
# rules. This allows the rules API to handle ruler outage by returning rules
4278-
# with default state. Ring replication-factor needs to be set to 2 or more for
4279-
# this to be useful.
4280-
# CLI flag: -experimental.ruler.api-enable-rules-backup
4281-
[api_enable_rules_backup: <boolean> | default = false]
4282-
42834274
# EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API
42844275
# response. If there are duplicate rules the rule with the latest evaluation
42854276
# timestamp will be kept.

integration/ruler_test.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) {
402402
testRulerAPIWithSharding(t, true)
403403
}
404404

405-
func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
405+
func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) {
406406
const numRulesGroups = 100
407407

408408
random := rand.New(rand.NewSource(time.Now().UnixNano()))
@@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
459459
// Enable the bucket index so we can skip the initial bucket scan.
460460
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
461461
}
462-
if enableAPIRulesBackup {
462+
if enableRulesBackup {
463463
overrides["-ruler.ring.replication-factor"] = "3"
464-
overrides["-experimental.ruler.api-enable-rules-backup"] = "true"
465464
}
466465
rulerFlags := mergeFlags(
467466
BlocksStorageFlags(),
@@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
556555
},
557556
}
558557
// For each test case, fetch the rules with configured filters, and ensure the results match.
559-
if enableAPIRulesBackup {
560-
err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down
558+
if enableRulesBackup {
559+
err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down
561560
require.NoError(t, err)
562561
}
563562
for name, tc := range testCases {

pkg/ruler/manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
116116
registry: reg,
117117
logger: logger,
118118
}
119-
if cfg.APIEnableRulesBackup {
119+
if cfg.RulesBackupEnabled() {
120120
m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
121121
}
122122
return m, nil

pkg/ruler/manager_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) {
262262
1 * time.Millisecond,
263263
}
264264
ruleManagerFactory := RuleManagerFactory(nil, waitDurations)
265-
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
265+
config := Config{RulePath: dir}
266+
config.Ring.ReplicationFactor = 3
267+
m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
266268
require.NoError(t, err)
267269

268270
const user1 = "testUser"

pkg/ruler/ruler.go

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,8 @@ type Config struct {
130130
Ring RingConfig `yaml:"ring"`
131131
FlushCheckPeriod time.Duration `yaml:"flush_period"`
132132

133-
EnableAPI bool `yaml:"enable_api"`
134-
APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"`
135-
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`
133+
EnableAPI bool `yaml:"enable_api"`
134+
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`
136135

137136
EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
138137
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
@@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
200199
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
201200
f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
202201
f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
203-
f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.")
204202
f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.")
205203
f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
206204
f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
@@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
217215
cfg.RingCheckPeriod = 5 * time.Second
218216
}
219217

218+
func (cfg *Config) RulesBackupEnabled() bool {
219+
// If the replication factor is greater than 1, only the first replica is responsible for evaluating the rule,
220+
// the rest of the replicas will store the rule groups as backup only for API HA.
221+
return cfg.Ring.ReplicationFactor > 1
222+
}
223+
220224
// MultiTenantManager is the interface of interaction with a Manager that is tenant aware.
221225
type MultiTenantManager interface {
222226
// SyncRuleGroups is used to sync the Manager with rules from the RuleStore.
@@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) {
581585
// This will also delete local group files for users that are no longer in 'configs' map.
582586
r.manager.SyncRuleGroups(ctx, loadedConfigs)
583587

584-
if r.cfg.APIEnableRulesBackup {
588+
if r.cfg.RulesBackupEnabled() {
585589
r.manager.BackUpRuleGroups(ctx, backupConfigs)
586590
}
587591
}
@@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou
604608
if err != nil {
605609
level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err)
606610
}
607-
if r.cfg.APIEnableRulesBackup {
611+
if r.cfg.RulesBackupEnabled() {
608612
loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs)
609613
if err != nil {
610614
level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err)
@@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
685689
if len(owned) > 0 {
686690
ownedConfigs[userID] = owned
687691
}
688-
if r.cfg.APIEnableRulesBackup {
692+
if r.cfg.RulesBackupEnabled() {
689693
backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
690694
if len(backup) > 0 {
691695
backedUpConfigs[userID] = backup
@@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
748752

749753
filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
750754
var filterBackup []*rulespb.RuleGroupDesc
751-
if r.cfg.APIEnableRulesBackup {
755+
if r.cfg.RulesBackupEnabled() {
752756
filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
753757
}
754758
if len(filterOwned) == 0 && len(filterBackup) == 0 {
@@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
11211125
)
11221126

11231127
zoneByAddress := make(map[string]string)
1124-
if r.cfg.APIEnableRulesBackup {
1128+
if r.cfg.RulesBackupEnabled() {
11251129
for _, ruler := range rulers.Instances {
11261130
zoneByAddress[ruler.Addr] = ruler.Zone
11271131
}
@@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
11461150
if err != nil {
11471151
level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err)
11481152
r.rulerGetRulesFailures.WithLabelValues(addr).Inc()
1149-
// If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should
1153+
// If rules backup is enabled and there are enough rulers replicating the rules, we should
11501154
// be able to handle failures.
1151-
if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor {
1155+
if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor {
11521156
mtx.Lock()
11531157
failedZones[zoneByAddress[addr]] = struct{}{}
11541158
errCount += 1
@@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
11681172
return nil
11691173
})
11701174

1171-
if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) {
1175+
if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) {
11721176
merged = mergeGroupStateDesc(merged)
11731177
}
11741178

@@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er
11831187
return nil, fmt.Errorf("no user id found in context")
11841188
}
11851189

1186-
groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup)
1190+
groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled())
11871191
if err != nil {
11881192
return nil, err
11891193
}

pkg/ruler/ruler_ring.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic
144144
// to 0, and then we update them whether zone-awareness is enabled or not.
145145
maxErrors := 0
146146
maxUnavailableZones := 0
147-
if cfg.ZoneAwarenessEnabled {
147+
// Because ring's Get method returns a number of rulers equal to the replication factor even if there is only 1 zone
148+
// and ZoneAwarenessEnabled, we can consider that ZoneAwarenessEnabled is disabled if there is only 1 zone since
149+
// rules are still replicated to rulers in the same zone.
150+
if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 {
148151
numReplicatedZones := min(len(ringZones), r.ReplicationFactor())
149152
// Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we
150153
// also need to handle case when RF < number of zones.

pkg/ruler/ruler_ring_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) {
180180
"z2": {},
181181
},
182182
},
183+
"should succeed on 1 unhealthy instance in RF=3 zone replication enabled but only 1 zone": {
184+
ringInstances: map[string]ring.InstanceDesc{
185+
"instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"},
186+
"instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"},
187+
"instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"},
188+
"instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"},
189+
},
190+
ringHeartbeatTimeout: time.Minute,
191+
ringReplicationFactor: 3,
192+
expectedSet: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"},
193+
enableAZReplication: true,
194+
expectedFailedZones: map[string]struct{}{
195+
"z1": {},
196+
},
197+
expectedMaxUnavailableZones: 0,
198+
expectedMaxError: 1,
199+
},
183200
}
184201

185202
for testName, testData := range tests {

0 commit comments

Comments (0)