Skip to content

Commit 7ec57ac

Browse files
Added UserReplicaGroupMetrics (#6463)
* Added UserReplicaGroupMetrics Signed-off-by: 7h3-3mp7y-m4n <[email protected]> * Adding the changed in ha_tracker Signed-off-by: 7h3-3mp7y-m4n <[email protected]> * Added testcase and minor changes Signed-off-by: 7h3-3mp7y-m4n <[email protected]> * Added changes to test Signed-off-by: 7h3-3mp7y-m4n <[email protected]> --------- Signed-off-by: 7h3-3mp7y-m4n <[email protected]>
1 parent c2c4827 commit 7ec57ac

File tree

2 files changed

+44
-1
lines changed

2 files changed

+44
-1
lines changed

pkg/ha/ha_tracker.go

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ import (
2323
"github.com/cortexproject/cortex/pkg/util/services"
2424
)
2525

26+
const (
27+
// userReplicaGroupUpdateInterval is the period of the ticker in loop that
// triggers updateUserReplicaGroupCount, i.e. how often the per-user replica
// group gauge is refreshed.
userReplicaGroupUpdateInterval = 30 * time.Second
28+
)
29+
2630
var (
2731
errNegativeUpdateTimeoutJitterMax = errors.New("HA tracker max update timeout jitter shouldn't be negative")
2832
errInvalidFailoverTimeout = "HA Tracker failover timeout (%v) must be at least 1s greater than update timeout - max jitter (%v)"
@@ -137,6 +141,7 @@ type HATracker struct {
137141
electedReplicaTimestamp *prometheus.GaugeVec
138142
electedReplicaPropagationTime prometheus.Histogram
139143
kvCASCalls *prometheus.CounterVec
144+
userReplicaGroupCount *prometheus.GaugeVec
140145

141146
cleanupRuns prometheus.Counter
142147
replicasMarkedForDeletion prometheus.Counter
@@ -182,6 +187,11 @@ func NewHATracker(cfg HATrackerConfig, limits HATrackerLimits, trackerStatusConf
182187
Help: "The total number of CAS calls to the KV store for a user ID/cluster.",
183188
}, []string{"user", "cluster"}),
184189

190+
userReplicaGroupCount: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
191+
Name: "ha_tracker_user_replica_group_count",
192+
Help: "Number of HA replica groups tracked for each user.",
193+
}, []string{"user"}),
194+
185195
cleanupRuns: promauto.With(reg).NewCounter(prometheus.CounterOpts{
186196
Name: "ha_tracker_replicas_cleanup_started_total",
187197
Help: "Number of elected replicas cleanup loops started.",
@@ -227,11 +237,26 @@ func (c *HATracker) loop(ctx context.Context) error {
227237

228238
// Start cleanup loop. It will stop when context is done.
229239
wg := sync.WaitGroup{}
230-
wg.Add(1)
240+
wg.Add(2)
231241
go func() {
232242
defer wg.Done()
233243
c.cleanupOldReplicasLoop(ctx)
234244
}()
245+
// Start periodic update of user replica group count.
246+
go func() {
247+
defer wg.Done()
248+
ticker := time.NewTicker(userReplicaGroupUpdateInterval)
249+
defer ticker.Stop()
250+
251+
for {
252+
select {
253+
case <-ticker.C:
254+
c.updateUserReplicaGroupCount()
255+
case <-ctx.Done():
256+
return
257+
}
258+
}
259+
}()
235260

236261
// The KVStore config we gave when creating c should have contained a prefix,
237262
// which would have given us a prefixed KVStore client. So, we can pass empty string here.
@@ -504,6 +529,9 @@ func (c *HATracker) CleanupHATrackerMetricsForUser(userID string) {
504529
if err := util.DeleteMatchingLabels(c.kvCASCalls, filter); err != nil {
505530
level.Warn(c.logger).Log("msg", "failed to remove cortex_ha_tracker_kv_store_cas_total metric for user", "user", userID, "err", err)
506531
}
532+
if err := util.DeleteMatchingLabels(c.userReplicaGroupCount, filter); err != nil {
533+
level.Warn(c.logger).Log("msg", "failed to remove cortex_ha_tracker_user_replica_group_count metric for user", "user", userID, "err", err)
534+
}
507535
}
508536

509537
// Returns a snapshot of the currently elected replicas. Useful for status display
@@ -521,3 +549,12 @@ func (c *HATracker) SnapshotElectedReplicas() map[string]ReplicaDesc {
521549
}
522550
return electedCopy
523551
}
552+
553+
func (t *HATracker) updateUserReplicaGroupCount() {
554+
t.electedLock.RLock()
555+
defer t.electedLock.RUnlock()
556+
557+
for user, groups := range t.replicaGroups {
558+
t.userReplicaGroupCount.WithLabelValues(user).Set(float64(len(groups)))
559+
}
560+
}

pkg/ha/ha_tracker_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ func TestHATracker_MetricsCleanup(t *testing.T) {
629629
"cortex_ha_tracker_elected_replica_changes_total",
630630
"cortex_ha_tracker_elected_replica_timestamp_seconds",
631631
"cortex_ha_tracker_kv_store_cas_total",
632+
"cortex_ha_tracker_user_replica_group_count",
632633
}
633634

634635
tr.electedReplicaChanges.WithLabelValues("userA", "replicaGroup1").Add(5)
@@ -640,6 +641,7 @@ func TestHATracker_MetricsCleanup(t *testing.T) {
640641
tr.kvCASCalls.WithLabelValues("userA", "replicaGroup1").Add(5)
641642
tr.kvCASCalls.WithLabelValues("userA", "replicaGroup2").Add(8)
642643
tr.kvCASCalls.WithLabelValues("userB", "replicaGroup").Add(10)
644+
tr.userReplicaGroupCount.WithLabelValues("userA").Add(5)
643645

644646
require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
645647
# HELP cortex_ha_tracker_elected_replica_changes_total The total number of times the elected replica has changed for a user ID/cluster.
@@ -659,6 +661,10 @@ func TestHATracker_MetricsCleanup(t *testing.T) {
659661
cortex_ha_tracker_kv_store_cas_total{cluster="replicaGroup",user="userB"} 10
660662
cortex_ha_tracker_kv_store_cas_total{cluster="replicaGroup1",user="userA"} 5
661663
cortex_ha_tracker_kv_store_cas_total{cluster="replicaGroup2",user="userA"} 8
664+
665+
# HELP cortex_ha_tracker_user_replica_group_count Number of HA replica groups tracked for each user.
666+
# TYPE cortex_ha_tracker_user_replica_group_count gauge
667+
cortex_ha_tracker_user_replica_group_count{user="userA"} 5
662668
`), metrics...))
663669

664670
tr.CleanupHATrackerMetricsForUser("userA")

0 commit comments

Comments
 (0)