Skip to content

Commit 33ab2a7

Browse files
authored
Add job-specific timeouts to GHA test jobs (#20730)
## Summary

Adds timeout commands to all CI test scripts to prevent indefinite hangs, following the pattern established in rapidsai/cuml#7533.

## Changes

Adds `timeout` commands to test execution in 13 CI scripts:

- C++ tests: 30m (run_cudf_ctests, run_cudf_examples, run_cudf_kafka_ctests, run_cudf_benchmark_smoketests)
- C++ memcheck: 2h (observed max: 70 min)
- Java tests: 30m (observed max: 9 min)
- Python cudf tests: 40m each (observed max: 14 min)
- Python other tests: 30m each (observed max: 13 min)
- Wheel cudf tests: 30m each (observed max: 14 min)
- Wheel cudf-polars tests: 1h (observed max: 30 min)
- Wheel dask-cudf tests: 15m (observed max: 6 min)
- cudf-polars polars tests: 30m (observed max: 13 min)
- cudf-polars with rapidsmpf: 15m (observed max: 6 min)
- narwhals tests: 15m each (observed max: 4 min)
- notebooks tests: 10m per notebook (observed max: 3 min)
- third-party integration tests: 45m (observed max: 23 min)

## Timeout Selection

Timeout values are set at 2-3x observed runtimes from recent successful test runs (analyzed from run 19673854267 and others), providing sufficient safety margin while preventing resource waste from hung tests.

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Gil Forsyth (https://github.com/gforsyth)

URL: #20730
1 parent 1ee5878 commit 33ab2a7

13 files changed

+34
-25
lines changed

ci/cudf_pandas_scripts/third-party-integration/test.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@ main() {
8585
trap "EXITCODE=1" ERR
8686
set +e
8787

88-
TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh "${lib}"
88+
TEST_DIR=${TEST_DIR} \
89+
NUM_PROCESSES=${NUM_PROCESSES} \
90+
timeout 45m \
91+
ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh "${lib}"
8992

9093
set -e
9194
rapids-logger "Test script exiting with value: ${EXITCODE}"

ci/test_cpp.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,26 +17,26 @@ set +e
1717
export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/
1818

1919
rapids-logger "Run libcudf gtests"
20-
./ci/run_cudf_ctests.sh -j20
20+
timeout 30m ./ci/run_cudf_ctests.sh -j20
2121
SUITEERROR=$?
2222

2323
if (( SUITEERROR == 0 )); then
2424
rapids-logger "Run libcudf examples"
25-
./ci/run_cudf_examples.sh
25+
timeout 30m ./ci/run_cudf_examples.sh
2626
SUITEERROR=$?
2727
fi
2828

2929
if (( SUITEERROR == 0 )); then
3030
rapids-logger "Run libcudf_kafka gtests"
31-
./ci/run_cudf_kafka_ctests.sh -j20
31+
timeout 30m ./ci/run_cudf_kafka_ctests.sh -j20
3232
SUITEERROR=$?
3333
fi
3434

3535
# Ensure that benchmarks are runnable
3636
rapids-logger "Run tests of libcudf benchmarks"
3737

3838
if (( SUITEERROR == 0 )); then
39-
./ci/run_cudf_benchmark_smoketests.sh
39+
timeout 30m ./ci/run_cudf_benchmark_smoketests.sh
4040
SUITEERROR=$?
4141
fi
4242

ci/test_cpp_memcheck.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ source ./ci/test_cpp_common.sh
1111

1212
rapids-logger "Memcheck gtests with rmm_mode=cuda"
1313

14-
./ci/run_cudf_memcheck_ctests.sh && EXITCODE=$? || EXITCODE=$?;
14+
timeout 2h ./ci/run_cudf_memcheck_ctests.sh && EXITCODE=$? || EXITCODE=$?;
1515

1616
rapids-logger "Test script exiting with value: $EXITCODE"
1717
# shellcheck disable=SC2086

ci/test_cudf_polars_polars_tests.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ trap set_exitcode ERR
4949
set +e
5050

5151
rapids-logger "Run polars tests"
52-
./ci/run_cudf_polars_polars_tests.sh
52+
timeout 30m ./ci/run_cudf_polars_polars_tests.sh
5353

5454
trap ERR
5555
set -e

ci/test_cudf_polars_with_rapidsmpf.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ set +e
4444
rapids-logger "Running cudf_polars tests with rapidsmpf"
4545

4646
# Run cudf_polars tests with rapidsmpf using dedicated test runner
47-
./ci/run_cudf_polars_with_rapidsmpf_pytests.sh \
47+
timeout 15m ./ci/run_cudf_polars_with_rapidsmpf_pytests.sh \
4848
--no-cov \
4949
--numprocesses=8 \
5050
--dist=worksteal \

ci/test_java.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ export LIBCUDF_LARGE_STRINGS_ENABLED=0
4242

4343
rapids-logger "Run Java tests"
4444
pushd java
45-
mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF
45+
timeout 30m mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF
4646
popd
4747

4848
rapids-logger "Test script exiting with value: $EXITCODE"

ci/test_narwhals.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ test_fill_null_series_limit_as_none[cudf] \
3838
"
3939

4040
rapids-logger "Run narwhals tests for cuDF"
41-
PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 python -m pytest \
41+
PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 \
42+
timeout 15m \
43+
python -m pytest \
4244
--cache-clear \
4345
-p xdist \
4446
-p env \
@@ -65,6 +67,7 @@ CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE=805306368 \
6567
CUDF_POLARS__EXECUTOR__FALLBACK_MODE=silent \
6668
PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 \
6769
NARWHALS_POLARS_GPU=1 \
70+
timeout 15m \
6871
python -m pytest \
6972
--cache-clear \
7073
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars-narwhals.xml" \
@@ -124,7 +127,10 @@ test_explode_multiple_cols or \
124127
(test_get_dtype_backend and pyarrow and (pandas or modin)) \
125128
"
126129

127-
PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 NARWHALS_DEFAULT_CONSTRUCTORS=pandas python -m pytest \
130+
PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 \
131+
NARWHALS_DEFAULT_CONSTRUCTORS=pandas \
132+
timeout 15m \
133+
python -m pytest \
128134
-p cudf.pandas \
129135
--cache-clear \
130136
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-pandas-narwhals.xml" \

ci/test_notebooks.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ for nb in $(find . -name "*.ipynb"); do
5858
echo "--------------------------------------------------------------------------------"
5959
else
6060
nvidia-smi
61-
${NBTEST} "${nbBasename}"
61+
timeout 10m ${NBTEST} "${nbBasename}"
6262
fi
6363
done
6464

ci/test_python_cudf.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ trap "EXITCODE=1" ERR
1818
set +e
1919

2020
rapids-logger "pytest pylibcudf"
21-
./ci/run_pylibcudf_pytests.sh \
21+
timeout 40m ./ci/run_pylibcudf_pytests.sh \
2222
--junitxml="${RAPIDS_TESTS_DIR}/junit-pylibcudf.xml" \
2323
--numprocesses=8 \
2424
--dist=worksteal \
@@ -28,7 +28,7 @@ rapids-logger "pytest pylibcudf"
2828
--cov-report=term
2929

3030
rapids-logger "pytest cudf"
31-
./ci/run_cudf_pytests.sh \
31+
timeout 40m ./ci/run_cudf_pytests.sh \
3232
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
3333
--numprocesses=8 \
3434
--dist=worksteal \
@@ -43,7 +43,7 @@ rapids-logger "pytest cudf"
4343
# They do not generate meaningful performance measurements.
4444

4545
rapids-logger "pytest for cudf benchmarks"
46-
./ci/run_cudf_pytest_benchmarks.sh \
46+
timeout 40m ./ci/run_cudf_pytest_benchmarks.sh \
4747
--benchmark-disable \
4848
--numprocesses=8 \
4949
--dist=worksteal \
@@ -53,7 +53,7 @@ rapids-logger "pytest for cudf benchmarks"
5353
--cov-report=term
5454

5555
rapids-logger "pytest for cudf benchmarks using pandas"
56-
./ci/run_cudf_pandas_pytest_benchmarks.sh \
56+
timeout 40m ./ci/run_cudf_pandas_pytest_benchmarks.sh \
5757
--benchmark-disable \
5858
--numprocesses=8 \
5959
--dist=worksteal \

ci/test_python_other.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ trap "EXITCODE=1" ERR
1818
set +e
1919

2020
rapids-logger "pytest dask_cudf"
21-
./ci/run_dask_cudf_pytests.sh \
21+
timeout 30m ./ci/run_dask_cudf_pytests.sh \
2222
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
2323
--numprocesses=8 \
2424
--dist=worksteal \
@@ -28,11 +28,11 @@ rapids-logger "pytest dask_cudf"
2828
--cov-report=term
2929

3030
rapids-logger "pytest cudf_kafka"
31-
./ci/run_cudf_kafka_pytests.sh \
31+
timeout 30m ./ci/run_cudf_kafka_pytests.sh \
3232
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml"
3333

3434
rapids-logger "pytest custreamz"
35-
./ci/run_custreamz_pytests.sh \
35+
timeout 30m ./ci/run_custreamz_pytests.sh \
3636
--junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
3737
--numprocesses=8 \
3838
--dist=worksteal \
@@ -42,7 +42,7 @@ rapids-logger "pytest custreamz"
4242
--cov-report=term
4343

4444
rapids-logger "pytest cudf-polars"
45-
./ci/run_cudf_polars_pytests.sh \
45+
timeout 30m ./ci/run_cudf_polars_pytests.sh \
4646
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" \
4747
--numprocesses=8 \
4848
--dist=worksteal \

0 commit comments

Comments
 (0)