-
Notifications
You must be signed in to change notification settings - Fork 226
Replace OS sleep with GPU nanosleep kernel in event timing test #1285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
9b11ffa
605f1ef
29f7882
7d5ee2b
5eba5ac
58f6685
ad16933
55d9b44
3762490
528e77e
f585ce0
909f380
18563e8
35510e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,47 +1,71 @@ | ||||||
| # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||
|
|
||||||
| import os | ||||||
| import time | ||||||
|
|
||||||
| import math | ||||||
|
|
||||||
| import cuda.core.experimental | ||||||
| import pytest | ||||||
| from cuda.core.experimental import ( | ||||||
| Device, | ||||||
| Event, | ||||||
| EventOptions, | ||||||
| LaunchConfig, | ||||||
| Program, | ||||||
| ProgramOptions, | ||||||
| launch, | ||||||
| ) | ||||||
| from helpers.latch import LatchKernel | ||||||
|
|
||||||
| from cuda_python_test_helpers import IS_WSL | ||||||
|
|
||||||
|
|
||||||
| def test_event_init_disabled(): | ||||||
| with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): | ||||||
| cuda.core.experimental._event.Event() # Ensure back door is locked. | ||||||
|
|
||||||
|
|
||||||
| def test_timing_success(init_cuda): | ||||||
| def test_event_elapsed_time_basic(init_cuda): | ||||||
| device = Device() | ||||||
| options = EventOptions(enable_timing=True) | ||||||
| stream = Device().create_stream() | ||||||
| delay_seconds = 0.5 | ||||||
| stream = device.create_stream() | ||||||
|
|
||||||
| # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay | ||||||
| # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics | ||||||
| # Use clock64() in a loop to ensure we actually wait for the full duration | ||||||
| clock_rate_hz = device.properties.clock_rate * 1000 | ||||||
| sleep_cycles = int(0.020 * clock_rate_hz) # 20 ms in clock cycles | ||||||
| code = f""" | ||||||
| extern "C" | ||||||
| __global__ void nanosleep_kernel() {{ | ||||||
| unsigned long long start = clock64(); | ||||||
| while (clock64() - start < {sleep_cycles}) {{ | ||||||
| __nanosleep(1000000); // 1 ms yield to avoid 100% spin | ||||||
| }} | ||||||
| }} | ||||||
| """ | ||||||
| arch = "".join(f"{i}" for i in device.compute_capability) | ||||||
| program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") | ||||||
rwgk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| prog = Program(code, code_type="c++", options=program_options) | ||||||
| mod = prog.compile("cubin") | ||||||
| kernel = mod.get_kernel("nanosleep_kernel") | ||||||
|
|
||||||
| e1 = stream.record(options=options) | ||||||
| time.sleep(delay_seconds) | ||||||
| # Launch the nanosleep kernel to introduce a guaranteed delay | ||||||
| config = LaunchConfig(grid=1, block=1) | ||||||
| launch(stream, config, kernel) | ||||||
| e2 = stream.record(options=options) | ||||||
| e2.sync() | ||||||
| elapsed_time_ms = e2 - e1 | ||||||
| assert isinstance(elapsed_time_ms, float) | ||||||
| # Using a generous tolerance, to avoid flaky tests: | ||||||
| # We only want to exercise the __sub__ method, this test is not meant | ||||||
| # to stress-test the CUDA driver or time.sleep(). | ||||||
| delay_ms = delay_seconds * 1000 | ||||||
| if os.name == "nt" or IS_WSL: # noqa: SIM108 | ||||||
| # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default. | ||||||
| generous_tolerance = 100 | ||||||
| else: | ||||||
| # Most modern Linux kernels have a default timer resolution of 1 ms. | ||||||
| generous_tolerance = 20 | ||||||
| assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance | ||||||
| delta_ms = e2 - e1 | ||||||
| assert isinstance(delta_ms, float) | ||||||
| # Sanity check: cuEventElapsedTime should always return a finite float for two completed | ||||||
| # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general | ||||||
| # undefined behavior, without asserting anything about the magnitude of the measured time. | ||||||
| assert math.isfinite(delta_ms) | ||||||
leofang marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| # With the nanosleep kernel between events, we can assert a positive elapsed time. | ||||||
| # The kernel sleeps for 20 ms using clock64(), so delta_ms should be at least ~10 ms. | ||||||
| # Using a 10 ms threshold (half the sleep duration) provides a large safety margin above | ||||||
| # the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic | ||||||
| # and non-flaky. | ||||||
| assert delta_ms > 10 | ||||||
|
||||||
| assert delta_ms > 10 | |
| assert delta_ms >= 10 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically: Because of the large safety margin (expected 10 ms) it shouldn't matter at all.
Readability aspect: Making an effort to be precise here would send the wrong message, by distracting from the large safety margin.
Uh oh!
There was an error while loading. Please reload this page.