Refactored retry config into _retry.py and added support for exponential backoff and Retry-After header #871

Open · wants to merge 4 commits into base: fcm-http2
Changes from 3 commits
253 changes: 253 additions & 0 deletions firebase_admin/_retry.py
@@ -0,0 +1,253 @@
# Copyright 2025 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Internal retry logic module

This module provides utilities for adding retry logic to HTTPX requests
"""

from __future__ import annotations
import copy
import email.utils
import random
import re
import time
from types import CoroutineType
from typing import Any, Callable, List, Optional, Tuple
import logging
import asyncio
import httpx

logger = logging.getLogger(__name__)


class HttpxRetry:
    """HTTPX based retry config."""
    # TODO: Decide
    # urllib3.Retry ignores the status_forcelist when respecting Retry-After header
Member

Does this mean urllib3.Retry retries other status codes (ones not in status_forcelist) when they come with Retry-After headers?

Contributor Author

I think I worded that comment poorly, so you can disregard it.

What I wanted to point out is that in urllib3.Retry there are no default status codes that are retried when status_forcelist and respect_retry_after_header are both unset. These RETRY_AFTER_STATUS_CODES are only applied if respect_retry_after_header is set.

I decided to do the same in this implementation, but still wanted to highlight it because the urllib3 docs were misleading and implied these are used as status_forcelist defaults.
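For illustration, a minimal sketch of the resulting behavior (values are hypothetical):

# Retry-After handling is strictly opt-in:
retry = HttpxRetry(status=4, respect_retry_after_header=True)
# A 429 or 503 response carrying a Retry-After header is now retryable even
# with status_forcelist unset; without the flag, only status_forcelist applies.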

    # Only 413, 429 and 503 errors are retried with the Retry-After header.
    # Should we do the same?
    # Default status codes to be used for ``status_forcelist``
    RETRY_AFTER_STATUS_CODES = frozenset([413, 429, 503])

    #: Default maximum backoff time.
    DEFAULT_BACKOFF_MAX = 120

    def __init__(
            self,
            status: int = 10,
Member

Would it make sense to call this max_retries instead?

Contributor Author (@jonathanedey, May 14, 2025)

In the current state, yes, because status retries are the only retries we have. This was more of a future-proofing decision for when we add error retries, where we would have total, status, and error counters.

Similar to the placeholder comment, I think we can use max_retries (and retries_left internally, since we decrease this value) and add the other options later.

            status_forcelist: Optional[List[int]] = None,
            backoff_factor: float = 0,
            backoff_max: float = DEFAULT_BACKOFF_MAX,
            raise_on_status: bool = False,
            backoff_jitter: float = 0,
            history: Optional[List[Tuple[
                httpx.Request,
                Optional[httpx.Response],
                Optional[Exception]
            ]]] = None,
            respect_retry_after_header: bool = False,
    ) -> None:
        self.status = status
        self.status_forcelist = status_forcelist
        self.backoff_factor = backoff_factor
        self.backoff_max = backoff_max
        self.raise_on_status = raise_on_status
Member

If raise_on_status is unused we should just remove it.

Contributor Author

Done!

        self.backoff_jitter = backoff_jitter
        if history:
            self.history = history
        else:
            self.history = []
        self.respect_retry_after_header = respect_retry_after_header

    def copy(self) -> HttpxRetry:
        """Creates a deep copy of this instance."""
        return copy.deepcopy(self)

    def is_retryable_response(self, response: httpx.Response) -> bool:
        """Determine if a response implies that the request should be retried if possible."""
        if self.status_forcelist and response.status_code in self.status_forcelist:
            return True

        has_retry_after = bool(response.headers.get("Retry-After"))
        if (
            self.respect_retry_after_header
            and has_retry_after
            and response.status_code in self.RETRY_AFTER_STATUS_CODES
        ):
            return True

        return False

    # Placeholder for exception retrying
    def is_retryable_error(self, error: Exception):
Member

It is best not to leave placeholders in PRs if we can. If this is currently a no-op we can remove it and add it when we need it.

Contributor Author

Sounds good, removed!

"""Determine if the error implies that the request should be retired if possible."""
logger.debug(error)
return False

    def is_exhausted(self) -> bool:
        """Determine if there are any more retries left."""
        # status count is negative
        return self.status < 0

    # Identical implementation of `urllib3.Retry.parse_retry_after()`
    def _parse_retry_after(self, retry_after_header: str) -> float | None:
        """Parses the Retry-After header string into a float of seconds."""
        seconds: float
        # Whitespace: https://tools.ietf.org/html/rfc7230#section-3.2.4
        if re.match(r"^\s*[0-9]+\s*$", retry_after_header):
            seconds = int(retry_after_header)
        else:
            retry_date_tuple = email.utils.parsedate_tz(retry_after_header)
            if retry_date_tuple is None:
                # TODO: Verify if this is the appropriate way to handle this.
                raise httpx.RemoteProtocolError(
                    f"Invalid Retry-After header: {retry_after_header}")

            retry_date = email.utils.mktime_tz(retry_date_tuple)
            seconds = retry_date - time.time()

        seconds = max(seconds, 0)

        return seconds

    def get_retry_after(self, response: httpx.Response) -> float | None:
        """Determine the Retry-After time needed before sending the next request."""
        retry_after_header = response.headers.get('Retry-After', None)
        if retry_after_header:
            # Convert retry header to a float in seconds
            return self._parse_retry_after(retry_after_header)
        return None

    def get_backoff_time(self):
        """Determine the backoff time needed before sending the next request."""
        # attempt_count is the number of previous request attempts
        attempt_count = len(self.history)
        # Backoff should be set to 0 until after first retry.
        if attempt_count <= 1:
            return 0
        backoff = self.backoff_factor * (2 ** (attempt_count - 1))
        if self.backoff_jitter:
            backoff += random.random() * self.backoff_jitter
        return float(max(0, min(self.backoff_max, backoff)))

    async def sleep_for_backoff(self) -> None:
        """Determine and wait the backoff time needed before sending the next request."""
        backoff = self.get_backoff_time()
        logger.debug('Sleeping for backoff of %f seconds following failed request', backoff)
        await asyncio.sleep(backoff)

    async def sleep(self, response: httpx.Response) -> None:
        """Determine and wait the time needed before sending the next request."""
        if self.respect_retry_after_header:
            retry_after = self.get_retry_after(response)
            if retry_after:
                logger.debug(
                    'Sleeping for Retry-After header of %f seconds following failed request',
                    retry_after
                )
                await asyncio.sleep(retry_after)
                return
        await self.sleep_for_backoff()

    def increment(
            self,
            request: httpx.Request,
            response: Optional[httpx.Response] = None,
            error: Optional[Exception] = None
    ) -> None:
        """Update the retry state based on the request attempt."""
        if response and self.is_retryable_response(response):
            self.status -= 1
        self.history.append((request, response, error))


# TODO: Remove comments
Member

You can remove these. Or add a comment on what it doesn't cover as a TODO for the future. I prefer we use a bug/issue to track these instead of TODO comments when possible.

Contributor Author

Done, these TODOs were mainly markers so as to not miss removing some thought-process comments before the final merge.

Moving the missing-feature notes to bugs.

# Note - This implementation currently covers:
# - basic retries for pre-defined status errors
# - applying retry backoff and backoff jitter
# - ability to respect a response's Retry-After header
class HttpxRetryTransport(httpx.AsyncBaseTransport):
    """HTTPX transport with retry logic."""

    # DEFAULT_RETRY = HttpxRetry(
    #     connect=1, read=1, status=4, status_forcelist=[500, 503],
    #     raise_on_status=False, backoff_factor=0.5, allowed_methods=None
    # )
    DEFAULT_RETRY = HttpxRetry(status=4, status_forcelist=[500, 503], backoff_factor=0.5)

    # We could also support passing kwargs here
    def __init__(self, retry: HttpxRetry = DEFAULT_RETRY, **kwargs) -> None:
        self._retry = retry

        transport_kwargs = kwargs.copy()
        transport_kwargs.update({'retries': 0, 'http2': True})
        # We should use a full AsyncHTTPTransport under the hood since that is
        # fully implemented. We could consider making this class extend an
        # AsyncHTTPTransport instead and use the parent class's methods to handle
        # requests. We should also ensure that the transport's internal retry is
        # not enabled.
        self._wrapped_transport = httpx.AsyncHTTPTransport(**transport_kwargs)

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        return await self._dispatch_with_retry(
            request, self._wrapped_transport.handle_async_request)

    # Two types of retries
    # - Status code (500s, redirect)
    # - Error code (read, connect, other)
    async def _dispatch_with_retry(
            self,
            request: httpx.Request,
            dispatch_method: Callable[[httpx.Request], CoroutineType[Any, Any, httpx.Response]]
    ) -> httpx.Response:
        """Sends a request with retry logic using a provided dispatch method."""
        # This retry config is shared across all requests that use this transport, so it
        # needs to be copied to be used for just this request and its retries.
        retry = self._retry.copy()
        # First request
        response, error = None, None

        while not retry.is_exhausted():

            # First retry
            if response:
                await retry.sleep(response)

            # Need to reset here so only the last attempt's error or response is saved.
            response, error = None, None

            try:
                logger.debug('Sending request in _dispatch_with_retry(): %r', request)
Member

Do we plan to keep these logs in the production code?

Contributor Author

I would want to keep these for future debugging. This would be helpful to catch issues as we iterate. Wdyt?

                response = await dispatch_method(request)
                logger.debug('Received response: %r', response)
            except httpx.HTTPError as err:
                logger.debug('Received error: %r', err)
                error = err

            if response and not retry.is_retryable_response(response):
                return response

            if error and not retry.is_retryable_error(error):
                raise error

            retry.increment(request, response)
Member

Should we pass the error here? retry.increment(request, response, error)

Contributor Author

Yes, good catch! Fixed


        if response:
            return response
        if error:
            raise error
        raise Exception('_dispatch_with_retry() ended with no response or exception')
Member

Let's narrow this down to an Exception type. RuntimeError or AssertionError might work better here.

Contributor Author

Let's go with AssertionError, since this case should never be reached if the logic above is correct.
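For reference, the agreed narrowing would be a one-line change:

raise AssertionError('_dispatch_with_retry() ended with no response or exception')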


    async def aclose(self) -> None:
        await self._wrapped_transport.aclose()
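
For reference, a minimal usage sketch of the new transport (hypothetical client setup, not part of this diff):

import httpx
from firebase_admin._retry import HttpxRetry, HttpxRetryTransport

# Retry up to 4 times on 500/503 responses, with 0.5s exponential backoff.
retry = HttpxRetry(status=4, status_forcelist=[500, 503], backoff_factor=0.5)
client = httpx.AsyncClient(transport=HttpxRetryTransport(retry=retry))
# Requests sent through this client are transparently retried before the
# final response or error is surfaced to the caller.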
51 changes: 9 additions & 42 deletions firebase_admin/messaging.py
@@ -20,6 +20,7 @@
import json
import warnings
import asyncio
import logging
import requests
import httpx

@@ -38,7 +39,9 @@
    exceptions,
    App
)
from firebase_admin._retry import HttpxRetryTransport

logger = logging.getLogger(__name__)
Member

Is there a way to set the log level in production code?

Contributor Author

Developers should be able to set the logging level in their production apps by using the following code:

import logging

logging.basicConfig()
firebase_admin_logger = logging.getLogger('firebase_admin')
firebase_admin_logger.setLevel(logging.DEBUG)


_MESSAGING_ATTRIBUTE = '_messaging'

@@ -410,6 +413,9 @@ def auth_flow(self, request: httpx.Request):
            # copy original headers
            request.headers = _original_headers.copy()
            # mutates request headers
            logger.debug(
                'Refreshing credentials for request attempt %d',
                _credential_refresh_attempt + 1)
            self.apply_auth_headers(request)

            # Continue to perform the request
@@ -420,6 +426,9 @@
            # on refreshable status codes. Current transport.requests.AuthorizedSession()
            # only does this on 401 errors. We should do the same.
            if response.status_code in self._refresh_status_codes:
                logger.debug(
                    'Request attempt %d failed due to unauthorized credentials',
                    _credential_refresh_attempt + 1)
                _credential_refresh_attempt += 1
            else:
                break
@@ -715,45 +724,3 @@ def _build_fcm_error(cls, error_dict) -> Optional[Callable[..., exceptions.FirebaseError]]:
                fcm_code = detail.get('errorCode')
                break
        return _MessagingService.FCM_ERROR_TYPES.get(fcm_code) if fcm_code else None


# TODO: Remove comments
# Notes:
# This implementation currently only covers basic retries for pre-defined status errors
class HttpxRetryTransport(httpx.AsyncBaseTransport):
    """HTTPX transport with retry logic."""
    # We could also support passing kwargs here
    def __init__(self, **kwargs) -> None:
        # Hardcoded settings for now
        self._retryable_status_codes = (500, 503,)
        self._max_retry_count = 4

        # - We use a full AsyncHTTPTransport under the hood to make use of its
        #   fully implemented `handle_async_request()`.
        # - We could consider making the `HttpxRetryTransport` class extend an
        #   `AsyncHTTPTransport` instead and use the parent class's methods to handle
        #   requests.
        # - We should also ensure that the transport's internal retry is
        #   not enabled.
        transport_kwargs = kwargs.copy()
        transport_kwargs.update({'retries': 0, 'http2': True})
        self._wrapped_transport = httpx.AsyncHTTPTransport(**transport_kwargs)


    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        _retry_count = 0

        while True:
            # Dispatch request
            # Let exceptions pass through for now
            response = await self._wrapped_transport.handle_async_request(request)

            # Check if request is retryable
            if response.status_code in self._retryable_status_codes:
                _retry_count += 1

                # Return if retries exhausted
                if _retry_count > self._max_retry_count:
                    return response
            else:
                return response