NVIDIA · rwgk · Nov 26, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 25, 2025
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
@@ -1,47 +1,71 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-import time
+
+import math
 
 import cuda.core.experimental
 import pytest
 from cuda.core.experimental import (
     Device,
     Event,
     EventOptions,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    launch,
 )
 from helpers.latch import LatchKernel
 
-from cuda_python_test_helpers import IS_WSL
-
 
 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-def test_timing_success(init_cuda):
+def test_event_elapsed_time_basic(init_cuda):
+    device = Device()
     options = EventOptions(enable_timing=True)
-    stream = Device().create_stream()
-    delay_seconds = 0.5
+    stream = device.create_stream()
+
+    # Create a simple kernel that waits for about 20 ms on the device to ensure a measurable delay.
+    # This keeps delta_ms well above 10 without depending on OS/driver timing characteristics.
+    # The kernel polls clock64() in a loop to ensure we actually wait for the full duration.
+    clock_rate_hz = device.properties.clock_rate * 1000
+    sleep_cycles = int(0.020 * clock_rate_hz)  # 20 ms in clock cycles
+    code = f"""
+    extern "C"
+    __global__ void nanosleep_kernel() {{
+        unsigned long long start = clock64();
+        while (clock64() - start < {sleep_cycles}) {{
+            __nanosleep(1000000); // 1 ms yield to avoid 100% spin
+        }}
+    }}
+    """
+    arch = "".join(f"{i}" for i in device.compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile("cubin")
+    kernel = mod.get_kernel("nanosleep_kernel")
+
     e1 = stream.record(options=options)
-    time.sleep(delay_seconds)
+    # Launch the nanosleep kernel to introduce a guaranteed delay
+    config = LaunchConfig(grid=1, block=1)
+    launch(stream, config, kernel)
     e2 = stream.record(options=options)
     e2.sync()
-    elapsed_time_ms = e2 - e1
-    assert isinstance(elapsed_time_ms, float)
-    # Using a generous tolerance, to avoid flaky tests:
-    # We only want to exercise the __sub__ method, this test is not meant
-    # to stress-test the CUDA driver or time.sleep().
-    delay_ms = delay_seconds * 1000
-    if os.name == "nt" or IS_WSL:  # noqa: SIM108
-        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-        generous_tolerance = 100
-    else:
-        # Most modern Linux kernels have a default timer resolution of 1 ms.
-        generous_tolerance = 20
-    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    delta_ms = e2 - e1
+    assert isinstance(delta_ms, float)
+    # Sanity check: cuEventElapsedTime should always return a finite float for two completed
+    # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general
+    # undefined behavior, without asserting anything about the magnitude of the measured time.
+    assert math.isfinite(delta_ms)
+    # With the nanosleep kernel between the events, we can also assert a lower bound on the elapsed time.
+    # The kernel waits for about 20 ms (measured with clock64()), so delta_ms should be well above 10 ms.
+    # Using a 10 ms threshold (half the wait duration) provides a large safety margin above
+    # the ~0.5 microsecond resolution of cuEventElapsedTime, which keeps this test from being
+    # flaky.
+    assert delta_ms > 10
-    assert delta_ms > 10
+    assert delta_ms >= 10
-    assert delta_ms > 10
+    assert delta_ms >= 10
 
 
 def test_is_sync_busy_waited(init_cuda):