
Commit f0af76d

Replace OS sleep with GPU nanosleep kernel in event timing test (#1285)
* Replace timing-based event test with deterministic elapsed-time check

  The previous test attempted to measure a real sleep delay between two event records, which introduced flakiness (especially on Windows/WDDM) and tested OS/driver timing behavior rather than the __sub__ implementation itself. This change replaces the test with a minimal, deterministic version that:

  * records two back-to-back events on the same stream
  * synchronizes on the second event to ensure both timestamps are valid
  * asserts that cuEventElapsedTime returns a finite, non-negative float

  This exercises the success path of Event.__sub__ without depending on actual GPU/OS timing characteristics or requiring artificial GPU work.

* cuda_core/tests/helpers/__init__.py: also use CUDA_HOME

* Revert "cuda_core/tests/helpers/__init__.py: also use CUDA_HOME"

  This reverts commit 605f1ef.

* Use nanosleep kernel in test_event_elapsed_time_basic for deterministic timing

  Replace the back-to-back event record test with a version that uses a __nanosleep kernel between events. This ensures a guaranteed positive elapsed time (delta_ms > 10) without depending on OS/driver timing characteristics or requiring artificial GPU work beyond the minimal nanosleep delay.

  The kernel sleeps for 20 ms (double the assertion threshold of 10 ms), providing a large safety margin above the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic and non-flaky across platforms, including Windows/WDDM.

* Fix nanosleep kernel to use clock64() loop for guaranteed duration

  Replace the single __nanosleep() call with a clock64()-based loop to ensure the kernel actually waits for the full 20 ms duration. A single __nanosleep() call does not guarantee the full sleep duration, which caused measured times to be orders of magnitude less than expected (~0.2 ms instead of ~20 ms).

  The new implementation:
  - Uses clock64() to measure actual elapsed time
  - Loops until 20 ms worth of clock cycles have elapsed
  - Uses __nanosleep(1000000) inside the loop to yield and avoid a 100% spin

  This ensures the delta_ms > 10 assertion is reliable and the test passes deterministically.

* clock64() return type is documented as `long long int`: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#time-function

* Use device.arch instead of joining device.compute_capability

* Cursor-generated cuda_core/tests/helpers/nanosleep_kernel.py

* Change NanosleepKernel API to sleep_duration_ms

* Rename back to test_timing_success

* Streamline a comment

* Polish comments. Make the code more similar to the existing code.

* Simplify nanosleep_kernel implementation.
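The cycle-budget arithmetic behind the clock64()-based loop described above can be sketched in pure Python. This is only an illustration of the commit message's reasoning; the 1 GHz SM clock rate in the example is an assumption, not a property of any particular GPU:

```python
def cycles_for_duration(sleep_duration_ms: int, clock_rate_hz: int) -> int:
    """Number of clock64() ticks corresponding to the requested sleep time."""
    # clock64() counts SM clock cycles, so the loop must spin until
    # sleep_duration_ms / 1000 seconds' worth of cycles have elapsed.
    return clock_rate_hz * sleep_duration_ms // 1000

# At an assumed 1 GHz SM clock, a 20 ms wait is 20 million cycles.
print(cycles_for_duration(20, 1_000_000_000))  # 20000000
```

Because the loop checks elapsed cycles rather than trusting __nanosleep()'s nominal argument, the full duration is guaranteed even when individual __nanosleep() calls return early.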
1 parent f52c71a commit f0af76d

File tree

2 files changed: +63 -18 lines changed

cuda_core/tests/helpers/nanosleep_kernel.py

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental import (
    LaunchConfig,
    Program,
    ProgramOptions,
    launch,
)


class NanosleepKernel:
    """
    Manages a kernel that sleeps for a specified duration using __nanosleep().
    """

    def __init__(self, device, sleep_duration_ms: int = 20):
        """
        Initialize the nanosleep kernel.

        Args:
            device: CUDA device to compile the kernel for
            sleep_duration_ms: Duration to sleep in milliseconds (default: 20)
        """
        code = f"""
        extern "C"
        __global__ void nanosleep_kernel() {{
            // The maximum sleep duration of a single __nanosleep() call is
            // approximately 1 millisecond.
            unsigned int one_ms = 1000000U;
            for (unsigned int i = 0; i < {sleep_duration_ms}; ++i) {{
                __nanosleep(one_ms);
            }}
        }}
        """
        program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        prog = Program(code, code_type="c++", options=program_options)
        mod = prog.compile("cubin")
        self.kernel = mod.get_kernel("nanosleep_kernel")

    def launch(self, stream):
        """Launch the nanosleep kernel on the given stream."""
        config = LaunchConfig(grid=1, block=1)
        launch(stream, config, self.kernel)
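As a GPU-free sanity check, the f-string expansion used by NanosleepKernel can be mirrored on the host to confirm that sleep_duration_ms becomes the loop bound in the generated CUDA source. make_nanosleep_source is a hypothetical stand-in for the template in NanosleepKernel.__init__, shown only for illustration:

```python
def make_nanosleep_source(sleep_duration_ms: int) -> str:
    """Rebuild the CUDA source string that NanosleepKernel compiles."""
    return f"""
    extern "C"
    __global__ void nanosleep_kernel() {{
        // Each __nanosleep() call sleeps for at most ~1 millisecond.
        unsigned int one_ms = 1000000U;
        for (unsigned int i = 0; i < {sleep_duration_ms}; ++i) {{
            __nanosleep(one_ms);
        }}
    }}
    """

code = make_nanosleep_source(20)
assert "i < 20" in code            # loop bound matches the requested duration
assert "__nanosleep(one_ms)" in code
```

Because only the loop bound is templated, a 20 ms request expands to twenty ~1 ms __nanosleep() iterations, matching the 2x safety margin over the test's 10 ms threshold.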

cuda_core/tests/test_event.py

Lines changed: 20 additions & 18 deletions
@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-import time
+
+import math
 
 import cuda.core.experimental
 import pytest
@@ -12,8 +12,7 @@
     EventOptions,
 )
 from helpers.latch import LatchKernel
-
-from cuda_python_test_helpers import IS_WSL
+from helpers.nanosleep_kernel import NanosleepKernel
 
 
 def test_event_init_disabled():
@@ -23,25 +22,28 @@ def test_event_init_disabled():
 
 def test_timing_success(init_cuda):
     options = EventOptions(enable_timing=True)
-    stream = Device().create_stream()
-    delay_seconds = 0.5
+    device = Device()
+    stream = device.create_stream()
+
+    # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay.
+    # This guarantees elapsed_time_ms > 10 without depending on OS/driver timing characteristics.
+    nanosleep = NanosleepKernel(device, sleep_duration_ms=20)
+
     e1 = stream.record(options=options)
-    time.sleep(delay_seconds)
+    nanosleep.launch(stream)  # Insert a guaranteed delay
     e2 = stream.record(options=options)
     e2.sync()
     elapsed_time_ms = e2 - e1
     assert isinstance(elapsed_time_ms, float)
-    # Using a generous tolerance, to avoid flaky tests:
-    # We only want to exercise the __sub__ method, this test is not meant
-    # to stress-test the CUDA driver or time.sleep().
-    delay_ms = delay_seconds * 1000
-    if os.name == "nt" or IS_WSL:  # noqa: SIM108
-        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-        generous_tolerance = 100
-    else:
-        # Most modern Linux kernels have a default timer resolution of 1 ms.
-        generous_tolerance = 20
-    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    # Sanity check: cuEventElapsedTime should always return a finite float for two completed
+    # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general
+    # undefined behavior, without asserting anything about the magnitude of the measured time.
+    assert math.isfinite(elapsed_time_ms)
+    # With the nanosleep kernel between events, the kernel sleeps for 20 ms using __nanosleep(),
+    # so elapsed_time_ms should definitely be larger than 10 ms. This provides a large safety
+    # margin above the ~0.5 microsecond resolution of cudaEventElapsedTime(), which should
+    # make this test deterministic and non-flaky.
+    assert elapsed_time_ms > 10
 
 
 def test_is_sync_busy_waited(init_cuda):
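The acceptance criteria the rewritten test enforces can be stated as a small pure-Python predicate. is_valid_elapsed_time is a hypothetical helper written only to make the checks explicit; it is not part of the test suite:

```python
import math

def is_valid_elapsed_time(elapsed_time_ms, threshold_ms: float = 10.0) -> bool:
    """Mirror the test's checks: a finite float above the safety threshold."""
    return (
        isinstance(elapsed_time_ms, float)
        and math.isfinite(elapsed_time_ms)
        and elapsed_time_ms > threshold_ms
    )

print(is_valid_elapsed_time(20.3))          # kernel slept ~20 ms -> True
print(is_valid_elapsed_time(float("nan")))  # driver/HW anomaly -> False
print(is_valid_elapsed_time(0.2))           # nanosleep returned early -> False
```

The last case is exactly the failure mode the clock64()/loop fix addressed: an early-returning __nanosleep() produced ~0.2 ms instead of ~20 ms.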
