Skip to content

Commit bb44c92

Browse files
authored
wrap fio command in a kernel timeout of 20mins (#110)
1 parent 7a8696c commit bb44c92

File tree

2 files changed: 16 additions (+16) and 15 deletions (-15)

distributed-micro-benchmark/workers/runner.sh

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -66,9 +66,9 @@ run_test_iterations() {
6666
MONITOR_PID=$(cat "$MONITOR_PID_FILE" 2>/dev/null)
6767

6868
# Populate Metadata
69-
# mkdir -p "$TEST_DATA_DIR"
70-
# if ! ls -R "$TEST_DATA_DIR" 1> /dev/null 2>&1; then :; fi
71-
echo "Not populating metadata!!"
69+
mkdir -p "$TEST_DATA_DIR"
70+
if ! ls -R "$TEST_DATA_DIR" 1> /dev/null 2>&1; then :; fi
71+
echo "Populated metadata for $TEST_DATA_DIR"
7272
# Drop Cache
7373
sync
7474
sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches' 2>/dev/null || true
@@ -86,12 +86,12 @@ run_test_iterations() {
8686
echo " [$(date +'%H:%M:%S')] Starting FIO execution...!!!..."
8787
fi
8888

89-
# Run FIO
89+
# Run FIO wrapped in an OS-level timeout (20 minutes / 1200s) to prevent infinite hanging
9090
OUTPUT_FILE="${TEST_DIR}/fio_output_${i}.json"
91-
if ! fio "$FIO_JOB" $FIO_TIME_ARGS --alloc-size=$((2 * 1024 * 1024)) --output-format=json --output="$OUTPUT_FILE"; then
92-
echo "ERROR: FIO execution failed" >&2
91+
if ! timeout -k 30 1200 fio "$FIO_JOB" $FIO_TIME_ARGS --alloc-size=$((2 * 1024 * 1024)) --output-format=json --output="$OUTPUT_FILE"; then
92+
echo "ERROR: FIO execution failed or OS TIMEOUT REACHED" >&2
9393
stop_monitoring "$MONITOR_PID" "$MONITOR_STOP_FLAG"
94-
sudo fusermount -u "$MOUNT_DIR" 2>/dev/null || sudo umount -f "$MOUNT_DIR" 2>/dev/null || true
94+
sudo fusermount -uz "$MOUNT_DIR" 2>/dev/null || sudo umount -l "$MOUNT_DIR" 2>/dev/null || true
9595
return 1
9696
fi
9797

@@ -104,7 +104,7 @@ run_test_iterations() {
104104
stop_monitoring "$MONITOR_PID" "$MONITOR_STOP_FLAG"
105105

106106
# Unmount
107-
sudo fusermount -u "$MOUNT_DIR" 2>/dev/null || sudo umount -f "$MOUNT_DIR" 2>/dev/null || true
107+
sudo fusermount -uz "$MOUNT_DIR" 2>/dev/null || sudo umount -l "$MOUNT_DIR" 2>/dev/null || true
108108

109109
# Clean cache again
110110
sync

distributed-micro-benchmark/workers/worker.sh

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -44,18 +44,19 @@ MOUNT_DIR="$WORKSPACE/mnt"
4444
# Error handling
4545
cleanup_gcsfuse() {
4646
echo "Cleaning up GCSFuse/FIO mounts and processes..." >&2
47-
47+
48+
# Kill any lingering GCSFuse and FIO processes aggressively
49+
echo " Killing any orphaned GCSFuse and FIO processes..." >&2
50+
sudo pkill -9 fio 2>/dev/null || true
51+
sudo pkill -9 gcsfuse 2>/dev/null || true
52+
4853
# Unmount if mounted
4954
if [ -n "${MOUNT_DIR:-}" ] && mountpoint -q "$MOUNT_DIR" 2>/dev/null; then
5055
echo " Unmounting $MOUNT_DIR..." >&2
51-
sudo fusermount -u "$MOUNT_DIR" 2>/dev/null || sudo umount -f "$MOUNT_DIR" 2>/dev/null || true
56+
sudo fusermount -uz "$MOUNT_DIR" 2>/dev/null || true
57+
sudo umount -l "$MOUNT_DIR" 2>/dev/null || true
5258
sleep 1
5359
fi
54-
55-
# Kill any lingering GCSFuse and FIO processes aggressively
56-
echo " Killing any orphaned GCSFuse and FIO processes..." >&2
57-
sudo pkill -9 -f gcsfuse 2>/dev/null || true
58-
sudo pkill -9 -f fio 2>/dev/null || true
5960
}
6061

6162
handle_error() {

0 commit comments

Comments
 (0)