Skip to content

Commit 7fcf6e3

Browse files
Add progress markers, unbuffered output, and log replay
- Print "Starting/Finished test X on GPU Y" to trace during execution
- Capture per-test output to .log files (unbuffered with python -u)
- Replay all logs sequentially into trace after parallel section
- Trace reads like sequential execution; per-test .log files in artifacts
- OOM errors captured via unbuffered writes before process death
1 parent fbbfb86 commit 7fcf6e3

File tree

1 file changed

+23
-2
lines changed

1 file changed

+23
-2
lines changed

qa/L0_pytorch_unittest/test.sh

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,16 @@ function run_test() {
5656

5757
if [ "$NUM_GPUS" -le 1 ]; then
5858
# Single GPU: run synchronously (identical to original behavior)
59-
eval "${env_prefix} python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
59+
eval "${env_prefix} python3 -u -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
6060
|| test_fail "$fail_label"
6161
else
6262
# Multi GPU: run in background on assigned GPU
63+
echo ">>> Starting: ${fail_label} on GPU ${GPU_LIST[$gpu_id]}"
6364
(
64-
eval "CUDA_VISIBLE_DEVICES=${GPU_LIST[$gpu_id]} ${env_prefix} python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
65+
eval "CUDA_VISIBLE_DEVICES=${GPU_LIST[$gpu_id]} ${env_prefix} python3 -u -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
66+
> "$XML_LOG_DIR/${xml_name%.xml}.log" 2>&1 \
6567
|| test_fail "$fail_label"
68+
echo ">>> Finished: ${fail_label} on GPU ${GPU_LIST[$gpu_id]}"
6669
) &
6770
fi
6871

@@ -121,6 +124,24 @@ if [ "$NUM_GPUS" -gt 1 ]; then
121124
wait
122125
fi
123126

127+
# ── Replay per-test logs into trace ──────────────────────────────────────────
128+
129+
if [ "$NUM_GPUS" -gt 1 ]; then
130+
echo ""
131+
echo "=== Per-test output (replayed from parallel execution) ==="
132+
for logfile in "$XML_LOG_DIR"/*.log; do
133+
if [ -f "$logfile" ]; then
134+
echo ""
135+
echo "────────────────────────────────────────────────────────"
136+
echo ">>> $(basename "$logfile" .log)"
137+
echo "────────────────────────────────────────────────────────"
138+
cat "$logfile"
139+
fi
140+
done
141+
echo ""
142+
echo "=== End of per-test output ==="
143+
fi
144+
124145
# ── Report results ──────────────────────────────────────────────────────────
125146

126147
if [ -s "$FAIL_DIR/failures" ]; then

0 commit comments

Comments
 (0)