Skip to content

Commit 22cb8c5

Browse files
esensarpront
andauthored
fix(metrics): prevent negative utilization on late messages (#24073)
* fix(metrics): prevent negative utilization on late messages This prevents late messages from breaking expected utilization bounds, by clamping all late messages to the start of reporting period. Messages arrive in order, so that should not break the expected state (waiting or not waiting). Utilization metric can still be slightly inaccurate on late messages, but it was never meant to be completely accurate. Fixes: #24060 * Add changelog entry * Add more details to changelog * Update src/utilization.rs Co-authored-by: Pavlos Rontidis <[email protected]> * Use `assert_approx_eq` in utilization tests --------- Co-authored-by: Pavlos Rontidis <[email protected]>
1 parent 64aacca commit 22cb8c5

File tree

2 files changed

+54
-4
lines changed

2 files changed

+54
-4
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fixed an issue where utilization could report negative values. This could happen if messages from components were processed too late and were accounted for wrong utilization measurement period. These messages are now moved to the current utilization period, meaning there might be some inaccuracy in the resulting utilization metric, but it was never meant to be precise.
2+
3+
authors: esensar Quad9DNS

src/utilization.rs

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,17 @@ impl Timer {
110110
/// Begin a new span representing time spent waiting
111111
pub(crate) fn start_wait(&mut self, at: Instant) {
112112
if !self.waiting {
113-
self.end_span(at);
113+
// Clamp start time in case of a late message
114+
self.end_span(at.max(self.overall_start));
114115
self.waiting = true;
115116
}
116117
}
117118

118119
/// Complete the current waiting span and begin a non-waiting span
119120
pub(crate) fn stop_wait(&mut self, at: Instant) {
120121
if self.waiting {
121-
self.end_span(at);
122+
// Clamp stop time in case of a late message
123+
self.end_span(at.max(self.overall_start));
122124
self.waiting = false;
123125
}
124126
}
@@ -143,7 +145,7 @@ impl Timer {
143145
self.gauge.set(avg_rounded);
144146

145147
// Reset overall statistics for the next reporting period.
146-
self.overall_start = self.span_start;
148+
self.overall_start = now;
147149
self.total_wait = Duration::new(0, 0);
148150

149151
#[cfg(debug_assertions)]
@@ -162,7 +164,9 @@ impl Timer {
162164

163165
fn end_span(&mut self, at: Instant) {
164166
if self.waiting {
165-
self.total_wait += at - self.span_start;
167+
// `at` can be before span start here, the result will be clamped to 0
168+
// because `duration_since` returns zero if `at` is before span start
169+
self.total_wait += at.duration_since(self.span_start);
166170
}
167171
self.span_start = at;
168172
}
@@ -322,6 +326,49 @@ mod tests {
322326
);
323327
}
324328

329+
#[test]
330+
#[serial]
331+
fn test_utilization_in_bounds_on_late_start() {
332+
let mut timer = setup_timer();
333+
334+
MockClock::advance(Duration::from_secs(5));
335+
336+
timer.update_utilization();
337+
338+
let avg = timer.ewma.average().unwrap();
339+
assert_approx_eq(avg, 1.0, "near 1.0 (never waiting)");
340+
341+
// Late message for start wait
342+
timer.start_wait(Instant::now() - Duration::from_secs(1));
343+
MockClock::advance(Duration::from_secs(5));
344+
345+
timer.update_utilization();
346+
let avg = timer.ewma.average().unwrap();
347+
assert_approx_eq(avg, 0.1, "~0.1");
348+
}
349+
350+
#[test]
351+
#[serial]
352+
fn test_utilization_in_bounds_on_late_stop() {
353+
let mut timer = setup_timer();
354+
355+
MockClock::advance(Duration::from_secs(5));
356+
357+
timer.waiting = true;
358+
timer.update_utilization();
359+
360+
let avg = timer.ewma.average().unwrap();
361+
assert_approx_eq(avg, 0.0, "near 0 (always waiting)");
362+
363+
// Late message for stop wait
364+
timer.stop_wait(Instant::now() - Duration::from_secs(4));
365+
MockClock::advance(Duration::from_secs(5));
366+
367+
timer.update_utilization();
368+
let avg = timer.ewma.average().unwrap();
369+
assert_approx_eq(avg, 0.9, "~0.9");
370+
}
371+
325372
#[test]
326373
#[serial]
327374
fn test_normal_utilization_within_bounds() {

0 commit comments

Comments
 (0)