Skip to content

Commit 299908e

Browse files
authored
Retry GCP requests on server error (#2243)
* Retry GCP requests on server error * Also retry OAuth * Lower default backoff configuration * Add retry disclaimer * Add retry_timeout * Add logging * Fix features
1 parent b826162 commit 299908e

File tree

8 files changed

+328
-24
lines changed

8 files changed

+328
-24
lines changed

object_store/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ quick-xml = { version = "0.23.0", features = ["serialize"], optional = true }
4848
rustls-pemfile = { version = "1.0", default-features = false, optional = true }
4949
ring = { version = "0.16", default-features = false, features = ["std"] }
5050
base64 = { version = "0.13", default-features = false, optional = true }
51+
rand = { version = "0.8", default-features = false, optional = true, features = ["std", "std_rng"] }
5152
# for rusoto
5253
hyper = { version = "0.14", optional = true, default-features = false }
5354
# for rusoto
@@ -58,7 +59,7 @@ percent-encoding = "2.1"
5859
rusoto_core = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] }
5960
rusoto_credential = { version = "0.48.0", optional = true, default-features = false }
6061
rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] }
61-
rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] }
62+
rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] }
6263
snafu = "0.7"
6364
tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] }
6465
tracing = { version = "0.1" }
@@ -71,7 +72,7 @@ walkdir = "2"
7172
[features]
7273
azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"]
7374
azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"]
74-
gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64"]
75+
gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand"]
7576
aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"]
7677

7778
[dev-dependencies] # In alphabetical order

object_store/src/client/backoff.rs

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use rand::prelude::*;
19+
use std::time::Duration;
20+
21+
/// Exponential backoff with jitter
22+
///
23+
/// See <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
24+
#[allow(missing_copy_implementations)]
25+
#[derive(Debug, Clone)]
26+
pub struct BackoffConfig {
27+
/// The initial backoff duration
28+
pub init_backoff: Duration,
29+
/// The maximum backoff duration
30+
pub max_backoff: Duration,
31+
/// The base of the exponential to use
32+
pub base: f64,
33+
}
34+
35+
impl Default for BackoffConfig {
36+
fn default() -> Self {
37+
Self {
38+
init_backoff: Duration::from_millis(100),
39+
max_backoff: Duration::from_secs(15),
40+
base: 2.,
41+
}
42+
}
43+
}
44+
45+
/// [`Backoff`] can be created from a [`BackoffConfig`]
46+
///
47+
/// Consecutive calls to [`Backoff::next`] will return the next backoff interval
48+
///
49+
pub struct Backoff {
50+
init_backoff: f64,
51+
next_backoff_secs: f64,
52+
max_backoff_secs: f64,
53+
base: f64,
54+
rng: Option<Box<dyn RngCore + Sync + Send>>,
55+
}
56+
57+
impl std::fmt::Debug for Backoff {
58+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59+
f.debug_struct("Backoff")
60+
.field("init_backoff", &self.init_backoff)
61+
.field("next_backoff_secs", &self.next_backoff_secs)
62+
.field("max_backoff_secs", &self.max_backoff_secs)
63+
.field("base", &self.base)
64+
.finish()
65+
}
66+
}
67+
68+
impl Backoff {
69+
/// Create a new [`Backoff`] from the provided [`BackoffConfig`]
70+
pub fn new(config: &BackoffConfig) -> Self {
71+
Self::new_with_rng(config, None)
72+
}
73+
74+
/// Creates a new `Backoff` with the optional `rng`
75+
///
76+
/// Used [`rand::thread_rng()`] if no rng provided
77+
pub fn new_with_rng(
78+
config: &BackoffConfig,
79+
rng: Option<Box<dyn RngCore + Sync + Send>>,
80+
) -> Self {
81+
let init_backoff = config.init_backoff.as_secs_f64();
82+
Self {
83+
init_backoff,
84+
next_backoff_secs: init_backoff,
85+
max_backoff_secs: config.max_backoff.as_secs_f64(),
86+
base: config.base,
87+
rng,
88+
}
89+
}
90+
91+
/// Returns the next backoff duration to wait for
92+
pub fn next(&mut self) -> Duration {
93+
let range = self.init_backoff..(self.next_backoff_secs * self.base);
94+
95+
let rand_backoff = match self.rng.as_mut() {
96+
Some(rng) => rng.gen_range(range),
97+
None => thread_rng().gen_range(range),
98+
};
99+
100+
let next_backoff = self.max_backoff_secs.min(rand_backoff);
101+
Duration::from_secs_f64(std::mem::replace(
102+
&mut self.next_backoff_secs,
103+
next_backoff,
104+
))
105+
}
106+
}
107+
108+
#[cfg(test)]
109+
mod tests {
110+
use super::*;
111+
use rand::rngs::mock::StepRng;
112+
113+
#[test]
114+
fn test_backoff() {
115+
let init_backoff_secs = 1.;
116+
let max_backoff_secs = 500.;
117+
let base = 3.;
118+
119+
let config = BackoffConfig {
120+
init_backoff: Duration::from_secs_f64(init_backoff_secs),
121+
max_backoff: Duration::from_secs_f64(max_backoff_secs),
122+
base,
123+
};
124+
125+
let assert_fuzzy_eq =
126+
|a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{} != {}", a, b);
127+
128+
// Create a static rng that takes the minimum of the range
129+
let rng = Box::new(StepRng::new(0, 0));
130+
let mut backoff = Backoff::new_with_rng(&config, Some(rng));
131+
132+
for _ in 0..20 {
133+
assert_eq!(backoff.next().as_secs_f64(), init_backoff_secs);
134+
}
135+
136+
// Create a static rng that takes the maximum of the range
137+
let rng = Box::new(StepRng::new(u64::MAX, 0));
138+
let mut backoff = Backoff::new_with_rng(&config, Some(rng));
139+
140+
for i in 0..20 {
141+
let value = (base.powi(i) * init_backoff_secs).min(max_backoff_secs);
142+
assert_fuzzy_eq(backoff.next().as_secs_f64(), value);
143+
}
144+
145+
// Create a static rng that takes the mid point of the range
146+
let rng = Box::new(StepRng::new(u64::MAX / 2, 0));
147+
let mut backoff = Backoff::new_with_rng(&config, Some(rng));
148+
149+
let mut value = init_backoff_secs;
150+
for _ in 0..20 {
151+
assert_fuzzy_eq(backoff.next().as_secs_f64(), value);
152+
value = (init_backoff_secs + (value * base - init_backoff_secs) / 2.)
153+
.min(max_backoff_secs);
154+
}
155+
}
156+
}

object_store/src/client/mod.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Generic utilities reqwest based ObjectStore implementations
19+
20+
pub mod backoff;
21+
pub mod oauth;
22+
pub mod retry;
23+
pub mod token;

object_store/src/oauth.rs renamed to object_store/src/client/oauth.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::token::TemporaryToken;
18+
use crate::client::retry::RetryExt;
19+
use crate::client::token::TemporaryToken;
20+
use crate::RetryConfig;
1921
use reqwest::{Client, Method};
2022
use ring::signature::RsaKeyPair;
2123
use snafu::{ResultExt, Snafu};
@@ -133,7 +135,11 @@ impl OAuthProvider {
133135
}
134136

135137
/// Fetch a fresh token
136-
pub async fn fetch_token(&self, client: &Client) -> Result<TemporaryToken<String>> {
138+
pub async fn fetch_token(
139+
&self,
140+
client: &Client,
141+
retry: &RetryConfig,
142+
) -> Result<TemporaryToken<String>> {
137143
let now = seconds_since_epoch();
138144
let exp = now + 3600;
139145

@@ -168,7 +174,7 @@ impl OAuthProvider {
168174
let response: TokenResponse = client
169175
.request(Method::POST, &self.audience)
170176
.form(&body)
171-
.send()
177+
.send_retry(retry)
172178
.await
173179
.context(TokenRequestSnafu)?
174180
.error_for_status()

object_store/src/client/retry.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! A shared HTTP client implementation incorporating retries
19+
20+
use crate::client::backoff::{Backoff, BackoffConfig};
21+
use futures::future::BoxFuture;
22+
use futures::FutureExt;
23+
use reqwest::{Response, Result};
24+
use std::time::{Duration, Instant};
25+
use tracing::info;
26+
27+
/// Contains the configuration for how to respond to server errors
28+
///
29+
/// By default they will be retried up to some limit, using exponential
30+
/// backoff with jitter. See [`BackoffConfig`] for more information
31+
///
32+
#[derive(Debug, Clone)]
33+
pub struct RetryConfig {
34+
/// The backoff configuration
35+
pub backoff: BackoffConfig,
36+
37+
/// The maximum number of times to retry a request
38+
///
39+
/// Set to 0 to disable retries
40+
pub max_retries: usize,
41+
42+
/// The maximum length of time from the initial request
43+
/// after which no further retries will be attempted
44+
///
45+
/// This not only bounds the length of time before a server
46+
/// error will be surfaced to the application, but also bounds
47+
/// the length of time a request's credentials must remain valid.
48+
///
49+
/// As requests are retried without renewing credentials or
50+
/// regenerating request payloads, this number should be kept
51+
/// below 5 minutes to avoid errors due to expired credentials
52+
/// and/or request payloads
53+
pub retry_timeout: Duration,
54+
}
55+
56+
impl Default for RetryConfig {
57+
fn default() -> Self {
58+
Self {
59+
backoff: Default::default(),
60+
max_retries: 10,
61+
retry_timeout: Duration::from_secs(3 * 60),
62+
}
63+
}
64+
}
65+
66+
pub trait RetryExt {
67+
/// Dispatch a request with the given retry configuration
68+
///
69+
/// # Panic
70+
///
71+
/// This will panic if the request body is a stream
72+
fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result<Response>>;
73+
}
74+
75+
impl RetryExt for reqwest::RequestBuilder {
76+
fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result<Response>> {
77+
let mut backoff = Backoff::new(&config.backoff);
78+
let max_retries = config.max_retries;
79+
let retry_timeout = config.retry_timeout;
80+
81+
async move {
82+
let mut retries = 0;
83+
let now = Instant::now();
84+
85+
loop {
86+
let s = self.try_clone().expect("request body must be cloneable");
87+
match s.send().await {
88+
Err(e)
89+
if retries < max_retries
90+
&& now.elapsed() < retry_timeout
91+
&& e.status()
92+
.map(|s| s.is_server_error())
93+
.unwrap_or(false) =>
94+
{
95+
let sleep = backoff.next();
96+
retries += 1;
97+
info!("Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), retries, max_retries);
98+
tokio::time::sleep(sleep).await;
99+
}
100+
r => return r,
101+
}
102+
}
103+
}
104+
.boxed()
105+
}
106+
}
File renamed without changes.

0 commit comments

Comments
 (0)