Skip to content
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 45 additions & 21 deletions cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,71 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os
import time

import math

import cuda.core.experimental
import pytest
from cuda.core.experimental import (
Device,
Event,
EventOptions,
LaunchConfig,
Program,
ProgramOptions,
launch,
)
from helpers.latch import LatchKernel

from cuda_python_test_helpers import IS_WSL


def test_event_init_disabled():
with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
cuda.core.experimental._event.Event() # Ensure back door is locked.


def test_timing_success(init_cuda):
def test_event_elapsed_time_basic(init_cuda):
device = Device()
options = EventOptions(enable_timing=True)
stream = Device().create_stream()
delay_seconds = 0.5
stream = device.create_stream()

# Create a simple kernel that sleeps for 20 ms to ensure a measurable delay
# This guarantees delta_ms > 10 without depending on OS/driver timing characteristics
# Use clock64() in a loop to ensure we actually wait for the full duration
clock_rate_hz = device.properties.clock_rate * 1000
sleep_cycles = int(0.020 * clock_rate_hz) # 20 ms in clock cycles
code = f"""
extern "C"
__global__ void nanosleep_kernel() {{
unsigned long long start = clock64();
while (clock64() - start < {sleep_cycles}) {{
__nanosleep(1000000); // 1 ms yield to avoid 100% spin
}}
}}
"""
arch = "".join(f"{i}" for i in device.compute_capability)
program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile("cubin")
kernel = mod.get_kernel("nanosleep_kernel")

e1 = stream.record(options=options)
time.sleep(delay_seconds)
# Launch the nanosleep kernel to introduce a guaranteed delay
config = LaunchConfig(grid=1, block=1)
launch(stream, config, kernel)
e2 = stream.record(options=options)
e2.sync()
elapsed_time_ms = e2 - e1
assert isinstance(elapsed_time_ms, float)
# Using a generous tolerance, to avoid flaky tests:
# We only want to exercise the __sub__ method, this test is not meant
# to stress-test the CUDA driver or time.sleep().
delay_ms = delay_seconds * 1000
if os.name == "nt" or IS_WSL: # noqa: SIM108
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
generous_tolerance = 100
else:
# Most modern Linux kernels have a default timer resolution of 1 ms.
generous_tolerance = 20
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
delta_ms = e2 - e1
assert isinstance(delta_ms, float)
# Sanity check: cuEventElapsedTime should always return a finite float for two completed
# events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general
# undefined behavior, without asserting anything about the magnitude of the measured time.
assert math.isfinite(delta_ms)
# With the nanosleep kernel between events, we can assert a positive elapsed time.
# The kernel sleeps for 20 ms using clock64(), so delta_ms should be at least ~10 ms.
# Using a 10 ms threshold (half the sleep duration) provides a large safety margin above
# the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic
# and non-flaky.
assert delta_ms > 10
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Should equality be included?

Suggested change
assert delta_ms > 10
assert delta_ms >= 10

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically: Because of the large safety margin (expected 10 ms) it shouldn't matter at all.

Readability aspect: Making an effort to be precise here would send the wrong message, by distracting from the large safety margin.



def test_is_sync_busy_waited(init_cuda):
Expand Down
Loading