Skip to content

Commit 15a9eae

Browse files
authored
fix query frontend per tenant metrics leak when cleaning up user labels (#6698)
* fix query frontend per tenant metrics leak when cleaning up user labels

  Signed-off-by: Ben Ye <[email protected]>

* changelog

  Signed-off-by: Ben Ye <[email protected]>

---------

Signed-off-by: Ben Ye <[email protected]>
1 parent 06cedd9 commit 15a9eae

File tree

3 files changed

+143
-12
lines changed

3 files changed

+143
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
* [BUGFIX] Ingester: Add check to avoid query 5xx when closing tsdb. #6616
3131
* [BUGFIX] Querier: Fix panic when marshaling QueryResultRequest. #6601
3232
* [BUGFIX] Ingester: Avoid resharding for query when restart readonly ingesters. #6642
33+
* [BUGFIX] Query Frontend: Fix cleanup of per-`user` metrics for inactive users. #6698
3334

3435
## 1.19.0 2025-02-27
3536

pkg/frontend/transport/handler.go

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -167,25 +167,49 @@ func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config,
167167
[]string{"reason", "source", "user"},
168168
)
169169

170-
h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(func(user string) {
171-
h.querySeconds.DeleteLabelValues(user)
172-
h.queryFetchedSeries.DeleteLabelValues(user)
173-
h.queryFetchedSamples.DeleteLabelValues(user)
174-
h.queryScannedSamples.DeleteLabelValues(user)
175-
h.queryPeakSamples.DeleteLabelValues(user)
176-
h.queryChunkBytes.DeleteLabelValues(user)
177-
h.queryDataBytes.DeleteLabelValues(user)
178-
if err := util.DeleteMatchingLabels(h.rejectedQueries, map[string]string{"user": user}); err != nil {
179-
level.Warn(log).Log("msg", "failed to remove cortex_rejected_queries_total metric for user", "user", user, "err", err)
180-
}
181-
})
170+
h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(h.cleanupMetricsForInactiveUser)
182171
// If the cleaner stops or fails, we will simply not clean up the metrics for inactive users.
183172
_ = h.activeUsers.StartAsync(context.Background())
184173
}
185174

186175
return h
187176
}
188177

178+
func (h *Handler) cleanupMetricsForInactiveUser(user string) {
179+
if !h.cfg.QueryStatsEnabled {
180+
return
181+
}
182+
183+
// Create a map with the user label to match
184+
userLabel := map[string]string{"user": user}
185+
186+
// Clean up all metrics for the user
187+
if err := util.DeleteMatchingLabels(h.querySeconds, userLabel); err != nil {
188+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_seconds_total metric for user", "user", user, "err", err)
189+
}
190+
if err := util.DeleteMatchingLabels(h.queryFetchedSeries, userLabel); err != nil {
191+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_fetched_series_total metric for user", "user", user, "err", err)
192+
}
193+
if err := util.DeleteMatchingLabels(h.queryFetchedSamples, userLabel); err != nil {
194+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_samples_total metric for user", "user", user, "err", err)
195+
}
196+
if err := util.DeleteMatchingLabels(h.queryScannedSamples, userLabel); err != nil {
197+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_samples_scanned_total metric for user", "user", user, "err", err)
198+
}
199+
if err := util.DeleteMatchingLabels(h.queryPeakSamples, userLabel); err != nil {
200+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_peak_samples metric for user", "user", user, "err", err)
201+
}
202+
if err := util.DeleteMatchingLabels(h.queryChunkBytes, userLabel); err != nil {
203+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_fetched_chunks_bytes_total metric for user", "user", user, "err", err)
204+
}
205+
if err := util.DeleteMatchingLabels(h.queryDataBytes, userLabel); err != nil {
206+
level.Warn(h.log).Log("msg", "failed to remove cortex_query_fetched_data_bytes_total metric for user", "user", user, "err", err)
207+
}
208+
if err := util.DeleteMatchingLabels(h.rejectedQueries, userLabel); err != nil {
209+
level.Warn(h.log).Log("msg", "failed to remove cortex_rejected_queries_total metric for user", "user", user, "err", err)
210+
}
211+
}
212+
189213
func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
190214
var (
191215
stats *querier_stats.QueryStats

pkg/frontend/transport/handler_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -637,3 +637,109 @@ func Test_TenantFederation_MaxTenant(t *testing.T) {
637637
})
638638
}
639639
}
640+
641+
// TestHandlerMetricsCleanup checks that cleanupMetricsForInactiveUser removes
// the series of exactly one user. It records values for user1 and user2 on all
// eight per-user metrics of a handler built with QueryStatsEnabled set, asserts
// that both users are exported, runs the cleanup for user1, and then asserts
// that only the user2 series remain.
func TestHandlerMetricsCleanup(t *testing.T) {
642+
reg := prometheus.NewPedanticRegistry()
643+
handler := NewHandler(HandlerConfig{QueryStatsEnabled: true}, tenantfederation.Config{}, http.DefaultTransport, log.NewNopLogger(), reg)
644+
645+
user1 := "user1"
646+
user2 := "user2"
647+
source := "api"
648+
649+
// Simulate activity for user1 on every per-user metric; the rejected-queries
// counter takes an additional leading reason label value.
650+
handler.querySeconds.WithLabelValues(source, user1).Add(1.0)
651+
handler.queryFetchedSeries.WithLabelValues(source, user1).Add(100)
652+
handler.queryFetchedSamples.WithLabelValues(source, user1).Add(1000)
653+
handler.queryScannedSamples.WithLabelValues(source, user1).Add(2000)
654+
handler.queryPeakSamples.WithLabelValues(source, user1).Observe(500)
655+
handler.queryChunkBytes.WithLabelValues(source, user1).Add(1024)
656+
handler.queryDataBytes.WithLabelValues(source, user1).Add(2048)
657+
handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user1).Add(5)
658+
659+
// Simulate activity for user2 with different values, so that the two users'
// series are distinguishable in the gathered output.
660+
handler.querySeconds.WithLabelValues(source, user2).Add(2.0)
661+
handler.queryFetchedSeries.WithLabelValues(source, user2).Add(200)
662+
handler.queryFetchedSamples.WithLabelValues(source, user2).Add(2000)
663+
handler.queryScannedSamples.WithLabelValues(source, user2).Add(4000)
664+
handler.queryPeakSamples.WithLabelValues(source, user2).Observe(1000)
665+
handler.queryChunkBytes.WithLabelValues(source, user2).Add(2048)
666+
handler.queryDataBytes.WithLabelValues(source, user2).Add(4096)
667+
handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user2).Add(10)
668+
669+
// Verify initial state - both users should have metrics
670+
require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(`
671+
# HELP cortex_query_seconds_total Total amount of wall clock time spend processing queries.
672+
# TYPE cortex_query_seconds_total counter
673+
cortex_query_seconds_total{source="api",user="user1"} 1
674+
cortex_query_seconds_total{source="api",user="user2"} 2
675+
# HELP cortex_query_fetched_series_total Number of series fetched to execute a query.
676+
# TYPE cortex_query_fetched_series_total counter
677+
cortex_query_fetched_series_total{source="api",user="user1"} 100
678+
cortex_query_fetched_series_total{source="api",user="user2"} 200
679+
# HELP cortex_query_samples_total Number of samples fetched to execute a query.
680+
# TYPE cortex_query_samples_total counter
681+
cortex_query_samples_total{source="api",user="user1"} 1000
682+
cortex_query_samples_total{source="api",user="user2"} 2000
683+
# HELP cortex_query_samples_scanned_total Number of samples scanned to execute a query.
684+
# TYPE cortex_query_samples_scanned_total counter
685+
cortex_query_samples_scanned_total{source="api",user="user1"} 2000
686+
cortex_query_samples_scanned_total{source="api",user="user2"} 4000
687+
# HELP cortex_query_peak_samples Highest count of samples considered to execute a query.
688+
# TYPE cortex_query_peak_samples histogram
689+
cortex_query_peak_samples_bucket{source="api",user="user1",le="+Inf"} 1
690+
cortex_query_peak_samples_sum{source="api",user="user1"} 500
691+
cortex_query_peak_samples_count{source="api",user="user1"} 1
692+
cortex_query_peak_samples_bucket{source="api",user="user2",le="+Inf"} 1
693+
cortex_query_peak_samples_sum{source="api",user="user2"} 1000
694+
cortex_query_peak_samples_count{source="api",user="user2"} 1
695+
# HELP cortex_query_fetched_chunks_bytes_total Size of all chunks fetched to execute a query in bytes.
696+
# TYPE cortex_query_fetched_chunks_bytes_total counter
697+
cortex_query_fetched_chunks_bytes_total{source="api",user="user1"} 1024
698+
cortex_query_fetched_chunks_bytes_total{source="api",user="user2"} 2048
699+
# HELP cortex_query_fetched_data_bytes_total Size of all data fetched to execute a query in bytes.
700+
# TYPE cortex_query_fetched_data_bytes_total counter
701+
cortex_query_fetched_data_bytes_total{source="api",user="user1"} 2048
702+
cortex_query_fetched_data_bytes_total{source="api",user="user2"} 4096
703+
# HELP cortex_rejected_queries_total The total number of queries that were rejected.
704+
# TYPE cortex_rejected_queries_total counter
705+
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user1"} 5
706+
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
707+
`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
708+
"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
709+
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
710+
711+
// Clean up metrics for user1 by invoking the callback that NewHandler hands
// to the active-users cleaner.
712+
handler.cleanupMetricsForInactiveUser(user1)
713+
714+
// Verify final state - only user2 should have metrics
715+
require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(`
716+
# HELP cortex_query_seconds_total Total amount of wall clock time spend processing queries.
717+
# TYPE cortex_query_seconds_total counter
718+
cortex_query_seconds_total{source="api",user="user2"} 2
719+
# HELP cortex_query_fetched_series_total Number of series fetched to execute a query.
720+
# TYPE cortex_query_fetched_series_total counter
721+
cortex_query_fetched_series_total{source="api",user="user2"} 200
722+
# HELP cortex_query_samples_total Number of samples fetched to execute a query.
723+
# TYPE cortex_query_samples_total counter
724+
cortex_query_samples_total{source="api",user="user2"} 2000
725+
# HELP cortex_query_samples_scanned_total Number of samples scanned to execute a query.
726+
# TYPE cortex_query_samples_scanned_total counter
727+
cortex_query_samples_scanned_total{source="api",user="user2"} 4000
728+
# HELP cortex_query_peak_samples Highest count of samples considered to execute a query.
729+
# TYPE cortex_query_peak_samples histogram
730+
cortex_query_peak_samples_bucket{source="api",user="user2",le="+Inf"} 1
731+
cortex_query_peak_samples_sum{source="api",user="user2"} 1000
732+
cortex_query_peak_samples_count{source="api",user="user2"} 1
733+
# HELP cortex_query_fetched_chunks_bytes_total Size of all chunks fetched to execute a query in bytes.
734+
# TYPE cortex_query_fetched_chunks_bytes_total counter
735+
cortex_query_fetched_chunks_bytes_total{source="api",user="user2"} 2048
736+
# HELP cortex_query_fetched_data_bytes_total Size of all data fetched to execute a query in bytes.
737+
# TYPE cortex_query_fetched_data_bytes_total counter
738+
cortex_query_fetched_data_bytes_total{source="api",user="user2"} 4096
739+
# HELP cortex_rejected_queries_total The total number of queries that were rejected.
740+
# TYPE cortex_rejected_queries_total counter
741+
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
742+
`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
743+
"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
744+
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
745+
}

0 commit comments

Comments
 (0)