From 059a93691c74571742e6f905790c6091dcdeb080 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Fri, 26 Apr 2024 11:18:49 -0700 Subject: [PATCH 1/3] fix random string used in test allocating 50MB in Cortex binary (#5903) Signed-off-by: Ben Ye --- pkg/distributor/distributor_test.go | 12 ++++++++++-- pkg/util/strings_test.go | 5 +++-- pkg/util/test_util.go | 12 +++++------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index fcb841efb7..ef064e467d 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -57,6 +57,14 @@ var ( emptyResponse = &cortexpb.WriteResponse{} ) +var ( + randomStrings = []string{} +) + +func init() { + randomStrings = util.GenerateRandomStrings() +} + func TestConfig_Validate(t *testing.T) { t.Parallel() tests := map[string]struct { @@ -2466,8 +2474,8 @@ func prepare(tb testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, [] // Strings to be used for get labels values/Names var unusedStrings []string if cfg.lblValuesPerIngester > 0 { - unusedStrings = make([]string, min(len(util.RandomStrings), cfg.numIngesters*cfg.lblValuesPerIngester)) - copy(unusedStrings, util.RandomStrings) + unusedStrings = make([]string, min(len(randomStrings), cfg.numIngesters*cfg.lblValuesPerIngester)) + copy(unusedStrings, randomStrings) } s := &prepState{ unusedStrings: unusedStrings, diff --git a/pkg/util/strings_test.go b/pkg/util/strings_test.go index 75496c7c46..de4cc28092 100644 --- a/pkg/util/strings_test.go +++ b/pkg/util/strings_test.go @@ -48,6 +48,7 @@ func BenchmarkMergeSlicesParallel(b *testing.B) { }, } + randomStrings := GenerateRandomStrings() type ParallelismType int const ( @@ -58,9 +59,9 @@ func BenchmarkMergeSlicesParallel(b *testing.B) { for _, tc := range testCases { input := make([][]string, tc.inputSize) - unusedStrings := make([]string, min(len(RandomStrings), tc.inputSize*tc.stringsPerInput)) + unusedStrings := make([]string, min(len(randomStrings), tc.inputSize*tc.stringsPerInput)) usedStrings := make([]string, 0, len(unusedStrings)) - copy(unusedStrings, RandomStrings) + copy(unusedStrings, randomStrings) for i := 0; i < tc.inputSize; i++ { stringsToBeReused := make([]string, len(usedStrings)) diff --git a/pkg/util/test_util.go b/pkg/util/test_util.go index 4f9c65f010..7cc95abe2f 100644 --- a/pkg/util/test_util.go +++ b/pkg/util/test_util.go @@ -5,12 +5,9 @@ import ( "strings" ) -var ( - randomChar = "0123456789abcdef" - RandomStrings = []string{} -) - -func init() { +func GenerateRandomStrings() []string { + randomChar := "0123456789abcdef" + randomStrings := make([]string, 0, 1000000) sb := strings.Builder{} for i := 0; i < 1000000; i++ { sb.Reset() @@ -18,6 +15,7 @@ func init() { for j := 0; j < 14; j++ { sb.WriteByte(randomChar[rand.Int()%len(randomChar)]) } - RandomStrings = append(RandomStrings, sb.String()) + randomStrings = append(randomStrings, sb.String()) } + return randomStrings } From 710cd3490b7ea8e5e38fe6d8c355d706af5ce376 Mon Sep 17 00:00:00 2001 From: Emmanuel Lodovice Date: Sat, 27 Apr 2024 02:46:14 +0800 Subject: [PATCH 2/3] =?UTF-8?q?Remove=20APIEnableRulesBackup=20config=20an?= =?UTF-8?q?d=20use=20Ring.ReplicationFactor=20ins=E2=80=A6=20(#5901)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove APIEnableRulesBackup config and use Ring.ReplicationFactor instead Signed-off-by: Emmanuel Lodovice * Update changelog Signed-off-by: Emmanuel Lodovice * Update change log Signed-off-by: Emmanuel Lodovice * Consider AZ awareness disabled if only 1 zone is present in list rules Signed-off-by: Emmanuel Lodovice --------- Signed-off-by: Emmanuel Lodovice --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 9 -- integration/ruler_test.go | 9 +- pkg/ruler/manager.go | 2 +- pkg/ruler/manager_test.go | 4 +- pkg/ruler/ruler.go | 30 +++--- pkg/ruler/ruler_ring.go | 5 +- pkg/ruler/ruler_ring_test.go | 17 +++ pkg/ruler/ruler_test.go | 110 ++++++++++++-------- 9 files changed, 111 insertions(+), 76 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 163a029fbf..a033a3d7d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Ruler: Remove `experimental.ruler.api-enable-rules-backup` flag and use `ruler.ring.replication-factor` to check if rules backup is enabled ## 1.17.0 in progress diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 57909431da..30bbee288c 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4271,15 +4271,6 @@ ring: # CLI flag: -experimental.ruler.enable-api [enable_api: | default = false] -# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers -# with default state (state before any evaluation) and send this copy in list -# API requests as backup in case the ruler who owns the rule fails to send its -# rules. This allows the rules API to handle ruler outage by returning rules -# with default state. Ring replication-factor needs to be set to 2 or more for -# this to be useful. -# CLI flag: -experimental.ruler.api-enable-rules-backup -[api_enable_rules_backup: | default = false] - # EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API # response. If there are duplicate rules the rule with the latest evaluation # timestamp will be kept. diff --git a/integration/ruler_test.go b/integration/ruler_test.go index 81e2440c76..be32583595 100644 --- a/integration/ruler_test.go +++ b/integration/ruler_test.go @@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) { testRulerAPIWithSharding(t, true) } -func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) { +func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) { const numRulesGroups = 100 random := rand.New(rand.NewSource(time.Now().UnixNano())) @@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) { // Enable the bucket index so we can skip the initial bucket scan. "-blocks-storage.bucket-store.bucket-index.enabled": "true", } - if enableAPIRulesBackup { + if enableRulesBackup { overrides["-ruler.ring.replication-factor"] = "3" - overrides["-experimental.ruler.api-enable-rules-backup"] = "true" } rulerFlags := mergeFlags( BlocksStorageFlags(), @@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) { }, } // For each test case, fetch the rules with configured filters, and ensure the results match. - if enableAPIRulesBackup { - err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down + if enableRulesBackup { + err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down require.NoError(t, err) } for name, tc := range testCases { diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 57d2d2907f..eb0e5ce9e4 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva registry: reg, logger: logger, } - if cfg.APIEnableRulesBackup { + if cfg.RulesBackupEnabled() { m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg) } return m, nil diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index d88d47e661..3d539c913d 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) { 1 * time.Millisecond, } ruleManagerFactory := RuleManagerFactory(nil, waitDurations) - m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) + config := Config{RulePath: dir} + config.Ring.ReplicationFactor = 3 + m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) require.NoError(t, err) const user1 = "testUser" diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 19497db85e..2fbfca361c 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -130,9 +130,8 @@ type Config struct { Ring RingConfig `yaml:"ring"` FlushCheckPeriod time.Duration `yaml:"flush_period"` - EnableAPI bool `yaml:"enable_api"` - APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"` - APIDeduplicateRules bool `yaml:"api_deduplicate_rules"` + EnableAPI bool `yaml:"enable_api"` + APIDeduplicateRules bool `yaml:"api_deduplicate_rules"` EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"` DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` @@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.") f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers") f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api") - f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.") f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.") f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`) f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`) @@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { cfg.RingCheckPeriod = 5 * time.Second } +func (cfg *Config) RulesBackupEnabled() bool { + // If the replication factor is greater the 1, only the first replica is responsible for evaluating the rule, + // the rest of the replica will store the rule groups as backup only for API HA. + return cfg.Ring.ReplicationFactor > 1 +} + // MultiTenantManager is the interface of interaction with a Manager that is tenant aware. type MultiTenantManager interface { // SyncRuleGroups is used to sync the Manager with rules from the RuleStore. @@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) { // This will also delete local group files for users that are no longer in 'configs' map. r.manager.SyncRuleGroups(ctx, loadedConfigs) - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { r.manager.BackUpRuleGroups(ctx, backupConfigs) } } @@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou if err != nil { level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err) } - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs) if err != nil { level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err) @@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp if len(owned) > 0 { ownedConfigs[userID] = owned } - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) if len(backup) > 0 { backedUpConfigs[userID] = backup @@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) var filterBackup []*rulespb.RuleGroupDesc - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) } if len(filterOwned) == 0 && len(filterBackup) == 0 { @@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest ) zoneByAddress := make(map[string]string) - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { for _, ruler := range rulers.Instances { zoneByAddress[ruler.Addr] = ruler.Zone } @@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest if err != nil { level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err) r.rulerGetRulesFailures.WithLabelValues(addr).Inc() - // If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should + // If rules backup is enabled and there are enough rulers replicating the rules, we should // be able to handle failures. - if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor { + if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor { mtx.Lock() failedZones[zoneByAddress[addr]] = struct{}{} errCount += 1 @@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest return nil }) - if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) { + if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) { merged = mergeGroupStateDesc(merged) } @@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er return nil, fmt.Errorf("no user id found in context") } - groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup) + groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled()) if err != nil { return nil, err } diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go index 2658d6e8e3..cac36e270d 100644 --- a/pkg/ruler/ruler_ring.go +++ b/pkg/ruler/ruler_ring.go @@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic // to 0, and then we update them whether zone-awareness is enabled or not. maxErrors := 0 maxUnavailableZones := 0 - if cfg.ZoneAwarenessEnabled { + // Because ring's Get method returns a number of ruler equal to the replication factor even if there is only 1 zone + // and ZoneAwarenessEnabled, we can consider that ZoneAwarenessEnabled is disabled if there is only 1 zone since + // rules are still replicated to rulers in the same zone. + if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 { numReplicatedZones := min(len(ringZones), r.ReplicationFactor()) // Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we // also need to handle case when RF < number of zones. diff --git a/pkg/ruler/ruler_ring_test.go b/pkg/ruler/ruler_ring_test.go index f9bfaeb03e..95f8009912 100644 --- a/pkg/ruler/ruler_ring_test.go +++ b/pkg/ruler/ruler_ring_test.go @@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) { "z2": {}, }, }, + "should succeed on 1 unhealthy instances in RF=3 zone replication enabled but only 1 zone": { + ringInstances: map[string]ring.InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"}, + "instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"}, + "instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"}, + "instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"}, + }, + ringHeartbeatTimeout: time.Minute, + ringReplicationFactor: 3, + expectedSet: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"}, + enableAZReplication: true, + expectedFailedZones: map[string]struct{}{ + "z1": {}, + }, + expectedMaxUnavailableZones: 0, + expectedMaxError: 1, + }, } for testName, testData := range tests { diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 22d52aa649..6932a8e342 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -349,7 +349,7 @@ func TestGetRules(t *testing.T) { rulerStateMap map[string]ring.InstanceState rulerAZMap map[string]string expectedError error - enableAPIRulesBackup bool + replicationFactor int enableZoneAwareReplication bool } @@ -473,6 +473,11 @@ func TestGetRules(t *testing.T) { "ruler2": "b", "ruler3": "c", } + rulerAZSingleZone := map[string]string{ + "ruler1": "a", + "ruler2": "a", + "ruler3": "a", + } expectedRules := expectedRulesMap{ "ruler1": map[string]rulespb.RuleGroupList{ @@ -541,7 +546,7 @@ func TestGetRules(t *testing.T) { "user2": 9, "user3": 3, }, - enableAPIRulesBackup: true, + replicationFactor: 3, expectedClientCallCount: len(expectedRules), }, "Shuffle Sharding and ShardSize = 2 with Rule Type Filter": { @@ -633,11 +638,11 @@ func TestGetRules(t *testing.T) { expectedClientCallCount: 0, }, "Shuffle Sharding and ShardSize = 3 with API Rules backup enabled": { - sharding: true, - shuffleShardSize: 3, - shardingStrategy: util.ShardingStrategyShuffle, - rulerStateMap: rulerStateMapAllActive, - enableAPIRulesBackup: true, + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + rulerStateMap: rulerStateMapAllActive, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -649,11 +654,11 @@ func TestGetRules(t *testing.T) { expectedClientCallCount: 3, }, "Shuffle Sharding and ShardSize = 3 with API Rules backup enabled and one ruler is in Pending state": { - sharding: true, - shuffleShardSize: 3, - shardingStrategy: util.ShardingStrategyShuffle, - rulerStateMap: rulerStateMapOnePending, - enableAPIRulesBackup: true, + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + rulerStateMap: rulerStateMapOnePending, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -665,11 +670,11 @@ func TestGetRules(t *testing.T) { expectedClientCallCount: 2, // one of the ruler is pending, so we don't expect that ruler to be called }, "Shuffle Sharding and ShardSize = 3 with API Rules backup enabled and two ruler is in Pending state": { - sharding: true, - shuffleShardSize: 3, - shardingStrategy: util.ShardingStrategyShuffle, - rulerStateMap: rulerStateMapTwoPending, - enableAPIRulesBackup: true, + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + rulerStateMap: rulerStateMapTwoPending, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -682,7 +687,7 @@ func TestGetRules(t *testing.T) { enableZoneAwareReplication: true, rulerStateMap: rulerStateMapAllActive, rulerAZMap: rulerAZEvenSpread, - enableAPIRulesBackup: true, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -700,7 +705,25 @@ func TestGetRules(t *testing.T) { enableZoneAwareReplication: true, rulerStateMap: rulerStateMapOnePending, rulerAZMap: rulerAZEvenSpread, - enableAPIRulesBackup: true, + replicationFactor: 3, + rulesRequest: RulesRequest{ + Type: recordingRuleFilter, + }, + expectedCount: map[string]int{ + "user1": 3, + "user2": 5, + "user3": 1, + }, + expectedClientCallCount: 2, // one of the ruler is pending, so we don't expect that ruler to be called + }, + "Shuffle Sharding and ShardSize = 3 and AZ replication with API Rules backup enabled and one ruler in pending state and rulers are in same az": { + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + enableZoneAwareReplication: true, + rulerStateMap: rulerStateMapOnePending, + rulerAZMap: rulerAZSingleZone, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -718,7 +741,7 @@ func TestGetRules(t *testing.T) { enableZoneAwareReplication: true, rulerStateMap: rulerStateMapTwoPending, rulerAZMap: rulerAZEvenSpread, - enableAPIRulesBackup: true, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -741,7 +764,6 @@ func TestGetRules(t *testing.T) { cfg.ShardingStrategy = tc.shardingStrategy cfg.EnableSharding = tc.sharding - cfg.APIEnableRulesBackup = tc.enableAPIRulesBackup cfg.Ring = RingConfig{ InstanceID: id, @@ -751,8 +773,8 @@ func TestGetRules(t *testing.T) { }, ReplicationFactor: 1, } - if tc.enableAPIRulesBackup { - cfg.Ring.ReplicationFactor = 3 + if tc.replicationFactor > 0 { + cfg.Ring.ReplicationFactor = tc.replicationFactor cfg.Ring.ZoneAwarenessEnabled = tc.enableZoneAwareReplication } if tc.enableZoneAwareReplication { @@ -877,7 +899,7 @@ func TestGetRules(t *testing.T) { numberOfRulers := len(rulerAddrMap) require.Equal(t, totalConfiguredRules*numberOfRulers, totalLoadedRules) } - if tc.enableAPIRulesBackup && tc.sharding && tc.expectedError == nil { + if tc.replicationFactor > 1 && tc.sharding && tc.expectedError == nil { // all rules should be backed up require.Equal(t, totalConfiguredRules, len(ruleBackupCount)) var hasUnhealthyRuler bool @@ -896,7 +918,7 @@ func TestGetRules(t *testing.T) { } } } else { - // If APIEnableRulesBackup is disabled, rulers should not back up any rules + // If rules backup is disabled, rulers should not back up any rules require.Equal(t, 0, len(ruleBackupCount)) } }) @@ -983,7 +1005,6 @@ func TestGetRulesFromBackup(t *testing.T) { cfg.ShardingStrategy = util.ShardingStrategyShuffle cfg.EnableSharding = true - cfg.APIEnableRulesBackup = true cfg.EvaluationInterval = 5 * time.Minute cfg.Ring = RingConfig{ @@ -1141,16 +1162,15 @@ func TestSharding(t *testing.T) { type expectedRulesMap map[string]map[string]rulespb.RuleGroupList type testCase struct { - sharding bool - shardingStrategy string - enableAPIRulesBackup bool - replicationFactor int - shuffleShardSize int - setupRing func(*ring.Desc) - enabledUsers []string - disabledUsers []string - expectedRules expectedRulesMap - expectedBackupRules expectedRulesMap + sharding bool + shardingStrategy string + replicationFactor int + shuffleShardSize int + setupRing func(*ring.Desc) + enabledUsers []string + disabledUsers []string + expectedRules expectedRulesMap + expectedBackupRules expectedRulesMap } const ( @@ -1524,12 +1544,11 @@ func TestSharding(t *testing.T) { }, "shuffle sharding, three rulers, shard size 2, enable api backup": { - sharding: true, - replicationFactor: 2, - shardingStrategy: util.ShardingStrategyShuffle, - enableAPIRulesBackup: true, - shuffleShardSize: 2, - enabledUsers: []string{user1}, + sharding: true, + replicationFactor: 2, + shardingStrategy: util.ShardingStrategyShuffle, + shuffleShardSize: 2, + enabledUsers: []string{user1}, setupRing: func(desc *ring.Desc) { desc.AddIngester(ruler1, ruler1Addr, "", sortTokens([]uint32{userToken(user1, 0) + 1, user1Group1Token + 1}), ring.ACTIVE, time.Now()) @@ -1566,9 +1585,8 @@ func TestSharding(t *testing.T) { setupRuler := func(id string, host string, port int, forceRing *ring.Ring) *Ruler { store := newMockRuleStore(allRules, nil) cfg := Config{ - EnableSharding: tc.sharding, - APIEnableRulesBackup: tc.enableAPIRulesBackup, - ShardingStrategy: tc.shardingStrategy, + EnableSharding: tc.sharding, + ShardingStrategy: tc.shardingStrategy, Ring: RingConfig{ InstanceID: id, InstanceAddr: host, @@ -1657,7 +1675,7 @@ func TestSharding(t *testing.T) { require.Equal(t, tc.expectedRules, expected) - if !tc.enableAPIRulesBackup { + if tc.replicationFactor <= 1 { require.Equal(t, 0, len(expectedBackup[ruler1])) require.Equal(t, 0, len(expectedBackup[ruler2])) require.Equal(t, 0, len(expectedBackup[ruler3])) From e9704d1da130fb152188927e23b27c769a4f6ce8 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Sat, 27 Apr 2024 15:45:40 -0700 Subject: [PATCH 3/3] update version and changelog Signed-off-by: Ben Ye --- CHANGELOG.md | 3 +-- VERSION | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a033a3d7d7..5b1c0ba801 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,6 @@ # Changelog ## master / unreleased -* [CHANGE] Ruler: Remove `experimental.ruler.api-enable-rules-backup` flag and use `ruler.ring.replication-factor` to check if rules backup is enabled ## 1.17.0 in progress @@ -25,7 +24,7 @@ * [FEATURE] Ruler: Add `ruler.concurrent-evals-enabled` flag to enable concurrent evaluation within a single rule group for independent rules. Maximum concurrency can be configured via `ruler.max-concurrent-evals`. #5766 * [FEATURE] Distributor Queryable: Experimental: Add config `zone_results_quorum_metadata`. When querying ingesters using metadata APIs such as label names and values, only results from quorum number of zones will be included and merged. #5779 * [FEATURE] Storage Cache Clients: Add config `set_async_circuit_breaker_config` to utilize the circuit breaker pattern for dynamically thresholding asynchronous set operations. Implemented in both memcached and redis cache clients. #5789 -* [FEATURE] Ruler: Add experimental `experimental.ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus compatible rules API endpoint. Add experimental `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication, but only the first ruler in the replicaset evaluates the rule group, the rest will just hold a copy as backup. Add experimental `experimental.ruler.api-enable-rules-backup` flag to configure rulers to send the rule group backups stored in the replicaset to handle events when a ruler is down during an API request to list rules. #5782 +* [FEATURE] Ruler: Add experimental `experimental.ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus compatible rules API endpoint. Add experimental `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication, but only the first ruler in the replicaset evaluates the rule group, the rest will be used as backup to handle events when a ruler is down during an API request to list rules. #5782 #5901 * [FEATURE] Ring: Add experimental `-ingester.tokens-generator-strategy=minimize-spread` flag to enable the new minimize spread token generator strategy. #5855 * [FEATURE] Ring Status Page: Add `Ownership Diff From Expected` column in the ring table to indicate the extent to which the ownership of a specific ingester differs from the expected ownership. #5889 * [ENHANCEMENT] Ingester: Add per-tenant new metric `cortex_ingester_tsdb_data_replay_duration_seconds`. #5477 diff --git a/VERSION b/VERSION index d2466b0cc9..b91f9c7f4d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.17.0-rc.0 +1.17.0-rc.1