Skip to content

Commit 1c6769a

Browse files
authored
ci: add setup downloads retries (#15398) (#15597)
## Description Backport of #15398 - Added exponential backoff retry decorator to Python downloads (libddwaf binaries, dedup_headers tool) with 10 retries and configurable delays - Increased Cargo retry count from 3 to 10 for all Rust dependency downloads - Added 3-minute download timeout + 2-minute inactivity timeout to all CMake FetchContent operations - Made retry behavior tunable via DD_DOWNLOAD_MAX_RETRIES, DD_DOWNLOAD_INITIAL_DELAY, and DD_DOWNLOAD_MAX_DELAY environment variables - Automatically retries on HTTP 429 (rate limit), 502/503/504 (server errors), and network timeouts
1 parent a101082 commit 1c6769a

File tree

8 files changed

+166
-22
lines changed

8 files changed

+166
-22
lines changed

ddtrace/appsec/_iast/_taint_tracking/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,11 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
4949
add_definitions(-DDONT_COMPILE_ABSEIL) # Define DONT_COMPILE_ABSEIL preprocessor variable
5050
else()
5151
message("Release, RelWithDebInfo, or MinSizeRel mode: using abseil (DD_COMPILE_ABSEIL unset or not 0/false)")
52-
FetchContent_Declare(absl URL "https://github.com/abseil/abseil-cpp/archive/refs/tags/20250127.1.zip")
52+
FetchContent_Declare(
53+
absl
54+
URL "https://github.com/abseil/abseil-cpp/archive/refs/tags/20250127.1.zip"
55+
TIMEOUT 180
56+
INACTIVITY_TIMEOUT 120 DOWNLOAD_EXTRACT_TIMESTAMP TRUE)
5357
FetchContent_MakeAvailable(absl)
5458
endif()
5559

ddtrace/appsec/_iast/_taint_tracking/tests/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
cmake_minimum_required(VERSION 3.19)
22

33
include(FetchContent)
4-
FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.16.0.zip)
4+
FetchContent_Declare(
5+
googletest
6+
URL https://github.com/google/googletest/archive/refs/tags/v1.16.0.zip
7+
TIMEOUT 180
8+
INACTIVITY_TIMEOUT 120)
59
FetchContent_MakeAvailable(googletest)
610

711
enable_testing()

ddtrace/internal/datadog/profiling/dd_wrapper/test/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ include(FetchContent)
22
FetchContent_Declare(
33
googletest
44
GIT_REPOSITORY https://github.com/google/googletest.git
5-
GIT_TAG v1.15.2)
5+
GIT_TAG v1.15.2
6+
TIMEOUT 180
7+
INACTIVITY_TIMEOUT 120)
68
set(gtest_force_shared_crt
79
ON
810
CACHE BOOL "" FORCE)

ddtrace/internal/datadog/profiling/stack_v2/test/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
FetchContent_Declare(
22
googletest
33
GIT_REPOSITORY https://github.com/google/googletest.git
4-
GIT_TAG v1.15.2)
4+
GIT_TAG v1.15.2
5+
TIMEOUT 180
6+
INACTIVITY_TIMEOUT 120)
57
set(gtest_force_shared_crt
68
ON
79
CACHE BOOL "" FORCE)

docs/build_system.rst

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,38 @@ These environment variables modify aspects of the build process.
224224

225225
version_added:
226226
v3.10.0:
227+
228+
DD_DOWNLOAD_MAX_RETRIES:
229+
type: Integer
230+
default: 10
231+
232+
description: |
233+
Maximum number of retry attempts for transient download failures from GitHub.
234+
Retries are triggered by HTTP 429 (rate limit), 502/503/504 (server errors),
235+
and network timeouts. Uses exponential backoff with jitter between retries.
236+
237+
version_added:
238+
v4.1.0:
239+
240+
DD_DOWNLOAD_INITIAL_DELAY:
241+
type: Float
242+
default: 1.0
243+
244+
description: |
245+
Initial delay in seconds before the first retry attempt.
246+
Delay increases exponentially with backoff_factor=1.618 (Fibonacci-like).
247+
Useful for tuning retry behavior in different environments.
248+
249+
version_added:
250+
v4.1.0:
251+
252+
DD_DOWNLOAD_MAX_DELAY:
253+
type: Float
254+
default: 120
255+
256+
description: |
257+
Maximum delay in seconds between retry attempts.
258+
Prevents excessive wait times during exponential backoff.
259+
260+
version_added:
261+
v4.1.0:

docs/spelling_wordlist.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ AWS
4040
ARN
4141
backend
4242
backends
43+
backoff
4344
backport
4445
backported
4546
backporting

setup.py

Lines changed: 101 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from itertools import chain
44
import os
55
import platform
6+
import random
67
import re
78
import shutil
89
import subprocess
@@ -41,7 +42,9 @@
4142
"https://ddtrace.readthedocs.io/en/stable/installation_quickstart.html"
4243
)
4344

45+
from functools import wraps
4446
from urllib.error import HTTPError
47+
from urllib.error import URLError
4548
from urllib.request import urlretrieve
4649

4750

@@ -84,6 +87,11 @@
8487

8588
SCCACHE_COMPILE = os.getenv("DD_USE_SCCACHE", "0").lower() in ("1", "yes", "on", "true")
8689

90+
# Retry configuration for downloads (handles GitHub API failures like 503, 429)
91+
DOWNLOAD_MAX_RETRIES = int(os.getenv("DD_DOWNLOAD_MAX_RETRIES", "10"))
92+
DOWNLOAD_INITIAL_DELAY = float(os.getenv("DD_DOWNLOAD_INITIAL_DELAY", "1.0"))
93+
DOWNLOAD_MAX_DELAY = float(os.getenv("DD_DOWNLOAD_MAX_DELAY", "120"))
94+
8795
IS_PYSTON = hasattr(sys, "pyston_version_info")
8896
IS_EDITABLE = False # Set to True if the package is being installed in editable mode
8997

@@ -139,6 +147,71 @@ def interpose_sccache():
139147
os.environ["CXX"] = str(sccache_path) + " " + str(cxx_path)
140148

141149

150+
def retry_download(
151+
max_attempts=DOWNLOAD_MAX_RETRIES,
152+
initial_delay=DOWNLOAD_INITIAL_DELAY,
153+
max_delay=DOWNLOAD_MAX_DELAY,
154+
backoff_factor=1.618,
155+
):
156+
"""
157+
Decorator to retry downloads with exponential backoff.
158+
Handles HTTP 503, 429, network errors from GitHub API, and cargo install failures.
159+
Retriable errors: HTTP 429 (rate limit), 502, 503, 504, network timeouts, and subprocess errors.
160+
"""
161+
162+
def decorator(func):
163+
@wraps(func)
164+
def wrapper(*args, **kwargs):
165+
for attempt in range(max_attempts):
166+
try:
167+
return func(*args, **kwargs)
168+
except (HTTPError, URLError, TimeoutError, OSError, subprocess.CalledProcessError) as e:
169+
# Check if it's a retriable error
170+
is_retriable = False
171+
if isinstance(e, HTTPError):
172+
# Retry on 429 (rate limit), 502/503/504 (server errors)
173+
is_retriable = e.code in (429, 502, 503, 504)
174+
error_code = f"HTTP {e.code}"
175+
elif isinstance(e, (URLError, TimeoutError)):
176+
# Retry on network errors and timeouts
177+
is_retriable = True
178+
error_code = type(e).__name__
179+
elif isinstance(e, OSError):
180+
# Retry on connection errors
181+
is_retriable = True
182+
error_code = type(e).__name__
183+
elif isinstance(e, subprocess.CalledProcessError):
184+
# Retry on subprocess errors (e.g., cargo install network failures)
185+
# These often indicate temporary network issues
186+
is_retriable = True
187+
error_code = f"subprocess exit code {e.returncode}"
188+
else:
189+
error_code = type(e).__name__
190+
191+
if not is_retriable:
192+
print(f"ERROR: Operation failed (non-retriable {error_code}): {e}")
193+
raise
194+
195+
if attempt == max_attempts - 1:
196+
print(f"ERROR: Operation failed after {max_attempts} attempts (last error: {error_code})")
197+
raise
198+
199+
# Calculate delay with jitter
200+
delay = min(initial_delay * (backoff_factor**attempt), max_delay)
201+
jitter = random.uniform(0, delay * 0.1)
202+
total_delay = delay + jitter
203+
204+
print(f"WARNING: Operation failed (attempt {attempt + 1}/{max_attempts}): {error_code} - {e}")
205+
print(f" Retrying in {total_delay:.1f} seconds...")
206+
time.sleep(total_delay)
207+
208+
return func(*args, **kwargs)
209+
210+
return wrapper
211+
212+
return decorator
213+
214+
142215
def verify_checksum_from_file(sha256_filename, filename):
143216
# sha256 File format is ``checksum`` followed by two whitespaces, then ``filename`` then ``\n``
144217
expected_checksum, expected_filename = list(filter(None, open(sha256_filename, "r").read().strip().split(" ")))
@@ -298,18 +371,24 @@ def is_installed(self, bin_file):
298371
def install_dedup_headers(self):
299372
"""Install dedup_headers if not already installed."""
300373
if not self.is_installed("dedup_headers"):
301-
subprocess.run(
302-
[
303-
"cargo",
304-
"install",
305-
"--git",
306-
"https://github.com/DataDog/libdatadog",
307-
"--bin",
308-
"dedup_headers",
309-
"tools",
310-
],
311-
check=True,
312-
)
374+
# Create retry-wrapped cargo install function
375+
@retry_download(max_attempts=DOWNLOAD_MAX_RETRIES, initial_delay=2.0)
376+
def cargo_install_with_retry():
377+
"""Run cargo install with retry on network failures."""
378+
subprocess.run(
379+
[
380+
"cargo",
381+
"install",
382+
"--git",
383+
"https://github.com/DataDog/libdatadog",
384+
"--bin",
385+
"dedup_headers",
386+
"tools",
387+
],
388+
check=True,
389+
)
390+
391+
cargo_install_with_retry()
313392

314393
def run(self):
315394
"""Run the build process with additional post-processing."""
@@ -411,16 +490,20 @@ def download_artifacts(cls):
411490
if not (cls.USE_CACHE and download_dest.exists()):
412491
print(f"Downloading {archive_name} to {download_dest}")
413492
start_ns = time.time_ns()
414-
try:
415-
filename, _ = urlretrieve(download_address, str(download_dest))
416-
except HTTPError as e:
417-
print("No archive found for dynamic library {}: {}".format(cls.name, archive_dir))
418-
raise e
493+
494+
# Create retry-wrapped download function
495+
@retry_download()
496+
def download_file(url, dest):
497+
"""Download file with automatic retry on transient errors."""
498+
return urlretrieve(url, str(dest))
499+
500+
filename, _ = download_file(download_address, download_dest)
419501

420502
# Verify checksum of downloaded file
421503
if cls.expected_checksums is None:
422504
sha256_address = download_address + ".sha256"
423-
sha256_filename, _ = urlretrieve(sha256_address, str(download_dest) + ".sha256")
505+
sha256_dest = str(download_dest) + ".sha256"
506+
sha256_filename, _ = download_file(sha256_address, sha256_dest)
424507
verify_checksum_from_file(sha256_filename, str(download_dest))
425508
else:
426509
expected_checksum = cls.expected_checksums[CURRENT_OS][arch]

src/native/.cargo/config.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,16 @@ rustflags = ["-C", "target-feature=-crt-static"]
1111

1212
[target.aarch64-unknown-linux-musl]
1313
rustflags = ["-C", "target-feature=-crt-static"]
14+
15+
[net]
16+
# Increase retries for GitHub API failures (default is 3)
17+
# Handles HTTP 503, 429, and other transient errors
18+
retry = 10
19+
20+
# Use system git for better reliability and retry handling
21+
git-fetch-with-cli = true
22+
23+
[http]
24+
# Timeout for HTTP operations (3 minutes)
25+
# Prevents hanging on slow/unresponsive servers
26+
timeout = 180

0 commit comments

Comments
 (0)