Skip to content

Commit b5ecab1

Browse files
authored
feat(utils): Add core CircuitBreaker functionality (#74560)
This completes the work, started in #74557 and #74559, of adding a new, class-and-rate-limit-based circuit breaker implementation to the codebase. In this PR, the core `record_error` and `should_allow_request` methods are added to the `CircuitBreaker` class, along with accompanying tests.
1 parent ffb67db commit b5ecab1

File tree

2 files changed

+319
-1
lines changed

2 files changed

+319
-1
lines changed

src/sentry/utils/circuit_breaker2.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212

1313
from django.conf import settings
1414

15-
from sentry.ratelimits.sliding_windows import Quota, RedisSlidingWindowRateLimiter, RequestedQuota
15+
from sentry.ratelimits.sliding_windows import (
16+
GrantedQuota,
17+
Quota,
18+
RedisSlidingWindowRateLimiter,
19+
RequestedQuota,
20+
)
1621

1722
logger = logging.getLogger(__name__)
1823

@@ -182,6 +187,96 @@ def __init__(self, key: str, config: CircuitBreakerConfig):
182187
)
183188
self.recovery_duration = default_recovery_duration
184189

190+
def record_error(self) -> None:
191+
"""
192+
Record a single error towards the breaker's quota, and handle the case where that error puts
193+
us over the limit.
194+
"""
195+
now = int(time.time())
196+
state, seconds_left_in_state = self._get_state_and_remaining_time()
197+
198+
if state == CircuitBreakerState.BROKEN:
199+
assert seconds_left_in_state is not None # mypy appeasement
200+
201+
# If the circuit is BROKEN, and `should_allow_request` is being used correctly, requests
202+
# should be blocked and we shouldn't even be here. That said, maybe there was a race
203+
# condition, so make sure the circuit hasn't just been tripped before crying foul.
204+
seconds_elapsed_in_state = self.broken_state_duration - seconds_left_in_state
205+
if seconds_elapsed_in_state > 5:
206+
logger.warning(
207+
"Attempt to record circuit breaker error while circuit is in BROKEN state",
208+
extra={"key": self.key, "time_in_state": seconds_elapsed_in_state},
209+
)
210+
# We shouldn't have made the request, so don't record the error
211+
return
212+
213+
# Even though we're not checking it during RECOVERY, we track errors in the primary quota as
214+
# well as in the RECOVERY quota because they still happened, and eventually switching back
215+
# to the okay state doesn't make that untrue
216+
quotas = (
217+
[self.primary_quota, self.recovery_quota]
218+
if state == CircuitBreakerState.RECOVERY
219+
else [self.primary_quota]
220+
)
221+
self.limiter.use_quotas(
222+
[RequestedQuota(self.key, 1, quotas)], [GrantedQuota(self.key, 1, [])], now
223+
)
224+
225+
# If incrementing has made us hit the current limit, switch to the BROKEN state
226+
controlling_quota = self._get_controlling_quota(state)
227+
remaining_errors_allowed = self._get_remaining_error_quota(controlling_quota)
228+
if remaining_errors_allowed == 0:
229+
logger.warning(
230+
"Circuit breaker '%s' error limit hit",
231+
self.key,
232+
extra={
233+
"current_state": state,
234+
"error_limit": controlling_quota.limit,
235+
"error_limit_window": controlling_quota.window_seconds,
236+
},
237+
)
238+
239+
# RECOVERY will only start after the BROKEN state has expired, so push out the RECOVERY
240+
# expiry time. We'll store the expiry times as our redis values so we can determine how
241+
# long we've been in a given state.
242+
broken_state_timeout = self.broken_state_duration
243+
recovery_state_timeout = self.broken_state_duration + self.recovery_duration
244+
broken_state_expiry = now + broken_state_timeout
245+
recovery_state_expiry = now + recovery_state_timeout
246+
247+
# Set redis keys for switching state. While they're both set (starting now) we'll be in
248+
# the BROKEN state. Once `broken_state_key` expires in redis we'll switch to RECOVERY,
249+
# and then once `recovery_state_key` expires we'll be back to normal.
250+
try:
251+
self._set_in_redis(
252+
[
253+
(self.broken_state_key, broken_state_expiry, broken_state_timeout),
254+
(self.recovery_state_key, recovery_state_expiry, recovery_state_timeout),
255+
]
256+
)
257+
258+
# If redis errors, stay in the current state
259+
except Exception:
260+
logger.exception(
261+
"Couldn't set state-change keys in redis for circuit breaker '%s'",
262+
self.key,
263+
extra={"current_state": state},
264+
)
265+
266+
def should_allow_request(self) -> bool:
267+
"""
268+
Determine, based on the current state of the breaker and the number of allowable errors
269+
remaining, whether requests should be allowed through.
270+
"""
271+
state, _ = self._get_state_and_remaining_time()
272+
273+
if state == CircuitBreakerState.BROKEN:
274+
return False
275+
276+
controlling_quota = self._get_controlling_quota(state)
277+
278+
return self._get_remaining_error_quota(controlling_quota) > 0
279+
185280
def _get_from_redis(self, keys: list[str]) -> Any:
186281
for key in keys:
187282
self.redis_pipeline.get(key)

tests/sentry/utils/test_circuit_breaker2.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from unittest import TestCase
44
from unittest.mock import ANY, MagicMock, patch
55

6+
import time_machine
67
from django.conf import settings
78
from redis.client import Pipeline
89

@@ -318,3 +319,225 @@ def test_fixes_mismatched_state_durations(self, mock_logger: MagicMock):
318319
500,
319320
)
320321
assert breaker.recovery_duration == 500
322+
323+
324+
@freeze_time()
325+
class RecordErrorTest(TestCase):
326+
def setUp(self) -> None:
327+
self.config = DEFAULT_CONFIG
328+
self.breaker = MockCircuitBreaker("dogs_are_great", self.config)
329+
330+
# Clear all existing keys from redis
331+
self.breaker.redis_pipeline.flushall()
332+
self.breaker.redis_pipeline.execute()
333+
334+
def test_increments_error_count(self):
335+
config = self.config
336+
breaker = self.breaker
337+
338+
# The breaker starts with a clean slate
339+
assert breaker._get_remaining_error_quota() == config["error_limit"]
340+
341+
breaker.record_error()
342+
343+
# The error has been tallied
344+
assert breaker._get_remaining_error_quota() == config["error_limit"] - 1
345+
346+
def test_no_error_recorded_in_broken_state(self):
347+
breaker = self.breaker
348+
349+
breaker._set_breaker_state(CircuitBreakerState.BROKEN)
350+
breaker._add_quota_usage(breaker.primary_quota, breaker.error_limit)
351+
352+
# Because we're in the BROKEN state, we start with the main quota maxed out and the
353+
# RECOVERY quota yet to be used
354+
assert breaker._get_remaining_error_quota(breaker.primary_quota) == 0
355+
assert (
356+
breaker._get_remaining_error_quota(breaker.recovery_quota)
357+
== breaker.recovery_error_limit
358+
)
359+
360+
breaker.record_error()
361+
362+
# Neither quota is incremented
363+
assert breaker._get_remaining_error_quota(breaker.primary_quota) == 0
364+
assert (
365+
breaker._get_remaining_error_quota(breaker.recovery_quota)
366+
== breaker.recovery_error_limit
367+
)
368+
369+
@patch("sentry.utils.circuit_breaker2.logger")
370+
def test_logs_a_warning_in_broken_state(self, mock_logger: MagicMock):
371+
breaker = self.breaker
372+
373+
seconds_ellapsed_since_circuit_break = 2
374+
breaker._set_breaker_state(
375+
CircuitBreakerState.BROKEN,
376+
seconds_left=breaker.broken_state_duration - seconds_ellapsed_since_circuit_break,
377+
)
378+
379+
breaker.record_error()
380+
381+
# No log - we just switched into BROKEN state, and even though we're not supposed to land in
382+
# the `record_error` method in that state, there's a small buffer to account for race
383+
# conditions
384+
assert mock_logger.warning.call_count == 0
385+
386+
seconds_ellapsed_since_circuit_break = 20
387+
breaker._set_breaker_state(
388+
CircuitBreakerState.BROKEN,
389+
seconds_left=breaker.broken_state_duration - seconds_ellapsed_since_circuit_break,
390+
)
391+
392+
breaker.record_error()
393+
394+
# Now we do log a warning, because at this point we can no longer blame a race condition -
395+
# it's been too long since the circuit broke
396+
mock_logger.warning.assert_called_with(
397+
"Attempt to record circuit breaker error while circuit is in BROKEN state",
398+
extra={"key": "dogs_are_great", "time_in_state": 20},
399+
)
400+
401+
@patch("sentry.utils.circuit_breaker2.logger")
402+
def test_handles_hitting_max_errors_in_non_broken_state(self, mock_logger: MagicMock):
403+
config = self.config
404+
breaker = self.breaker
405+
now = int(time.time())
406+
407+
for state, quota, limit in [
408+
(CircuitBreakerState.OK, breaker.primary_quota, breaker.error_limit),
409+
(CircuitBreakerState.RECOVERY, breaker.recovery_quota, breaker.recovery_error_limit),
410+
]:
411+
412+
breaker._set_breaker_state(state)
413+
breaker._add_quota_usage(quota, limit - 1)
414+
assert breaker._get_remaining_error_quota(quota) == 1
415+
assert breaker._get_controlling_quota() == quota
416+
417+
breaker.record_error()
418+
419+
# Hitting the limit puts us into the BROKEN state
420+
assert breaker._get_remaining_error_quota(quota) == 0
421+
assert breaker._get_controlling_quota() is None
422+
assert breaker._get_state_and_remaining_time() == (
423+
CircuitBreakerState.BROKEN,
424+
breaker.broken_state_duration,
425+
)
426+
mock_logger.warning.assert_called_with(
427+
"Circuit breaker '%s' error limit hit",
428+
"dogs_are_great",
429+
extra={
430+
"current_state": state,
431+
"error_limit": limit,
432+
"error_limit_window": config["error_limit_window"],
433+
},
434+
)
435+
436+
# Now jump to one second after the BROKEN state has expired to see that we're in
437+
# RECOVERY
438+
with time_machine.travel(now + breaker.broken_state_duration + 1, tick=False):
439+
assert breaker._get_controlling_quota() is breaker.recovery_quota
440+
assert breaker._get_state_and_remaining_time() == (
441+
CircuitBreakerState.RECOVERY,
442+
breaker.recovery_duration - 1,
443+
)
444+
445+
@patch("sentry.utils.circuit_breaker2.logger")
446+
def test_stays_in_current_state_if_redis_call_changing_state_fails(
447+
self, mock_logger: MagicMock
448+
):
449+
breaker = self.breaker
450+
451+
for current_state, quota, limit, seconds_left in [
452+
# The case where the current state is the BROKEN state isn't included here because the
453+
# switch from BROKEN state to RECOVERY state happens passively (by `broken_state_key`
454+
# expiring), rather than through an active call to redis
455+
(
456+
CircuitBreakerState.OK,
457+
breaker.primary_quota,
458+
breaker.error_limit,
459+
None,
460+
),
461+
(
462+
CircuitBreakerState.RECOVERY,
463+
breaker.recovery_quota,
464+
breaker.recovery_error_limit,
465+
1231,
466+
),
467+
]:
468+
469+
breaker._set_breaker_state(current_state, seconds_left)
470+
breaker._add_quota_usage(quota, limit - 1)
471+
assert breaker._get_remaining_error_quota(quota) == 1
472+
assert breaker._get_controlling_quota() == quota
473+
474+
with patch(
475+
"sentry.utils.circuit_breaker2.CircuitBreaker._set_in_redis", side_effect=Exception
476+
):
477+
breaker.record_error()
478+
479+
# We've recorded the error, but the state hasn't changed
480+
assert breaker._get_remaining_error_quota(quota) == 0
481+
assert breaker._get_controlling_quota() == quota
482+
assert breaker._get_state_and_remaining_time() == (current_state, seconds_left)
483+
mock_logger.exception.assert_called_with(
484+
"Couldn't set state-change keys in redis for circuit breaker '%s'",
485+
breaker.key,
486+
extra={"current_state": current_state},
487+
)
488+
489+
490+
@freeze_time()
491+
class ShouldAllowRequestTest(TestCase):
492+
def setUp(self) -> None:
493+
self.config = DEFAULT_CONFIG
494+
self.breaker = MockCircuitBreaker("dogs_are_great", self.config)
495+
496+
# Clear all existing keys from redis
497+
self.breaker.redis_pipeline.flushall()
498+
self.breaker.redis_pipeline.execute()
499+
500+
def test_allows_request_in_non_broken_state_with_quota_remaining(self):
501+
breaker = self.breaker
502+
503+
for state, quota, limit in [
504+
(CircuitBreakerState.OK, breaker.primary_quota, breaker.error_limit),
505+
(CircuitBreakerState.RECOVERY, breaker.recovery_quota, breaker.recovery_error_limit),
506+
]:
507+
breaker._set_breaker_state(state)
508+
breaker._add_quota_usage(quota, limit - 5)
509+
assert breaker._get_remaining_error_quota(quota) == 5
510+
511+
assert breaker.should_allow_request() is True
512+
513+
def test_blocks_request_in_non_broken_state_with_no_quota_remaining(self):
514+
breaker = self.breaker
515+
516+
for state, quota, limit in [
517+
(CircuitBreakerState.OK, breaker.primary_quota, breaker.error_limit),
518+
(CircuitBreakerState.RECOVERY, breaker.recovery_quota, breaker.recovery_error_limit),
519+
]:
520+
breaker._set_breaker_state(state)
521+
breaker._add_quota_usage(quota, limit)
522+
assert breaker._get_remaining_error_quota(quota) == 0
523+
524+
assert breaker.should_allow_request() is False
525+
526+
def test_blocks_request_in_BROKEN_state(self):
527+
breaker = self.breaker
528+
529+
breaker._set_breaker_state(CircuitBreakerState.BROKEN)
530+
531+
assert breaker.should_allow_request() is False
532+
533+
@patch("sentry.utils.circuit_breaker2.logger")
534+
def test_allows_request_if_redis_call_fails(self, mock_logger: MagicMock):
535+
breaker = self.breaker
536+
537+
with patch(
538+
"sentry.utils.circuit_breaker2.CircuitBreaker._get_from_redis", side_effect=Exception
539+
):
540+
assert breaker.should_allow_request() is True
541+
mock_logger.exception.assert_called_with(
542+
"Couldn't get state from redis for circuit breaker '%s'", breaker.key
543+
)

0 commit comments

Comments
 (0)