From 6d789cb19c0b8d8279fb72e9328b466ef0640dec Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:20:34 -0800 Subject: [PATCH 01/44] commit squash: add ProgramOptions to Program --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/cuda/core/experimental/_program.py | 371 ++++++++++++++++++- cuda_core/cuda/core/experimental/_stream.py | 4 +- cuda_core/cuda/core/experimental/_utils.py | 7 + cuda_core/docs/source/release/0.2.0-notes.md | 12 + cuda_core/examples/saxpy.py | 9 +- cuda_core/examples/vector_add.py | 14 +- cuda_core/tests/test_program.py | 39 +- 8 files changed, 425 insertions(+), 33 deletions(-) create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..c48f2c2c 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -5,5 +5,5 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch -from cuda.core.experimental._program import Program +from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 6cf13c83..b8442f83 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -2,9 +2,353 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + from cuda import nvrtc +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import _handle_boolean_option, handle_return + + 
+@dataclass +class ProgramOptions: + """Customizable :obj:`ProgramOptions` for NVRTC. + + Attributes + ---------- + gpu_architecture : str, optional + Specify the name of the class of GPU architectures for which the input must be compiled. + Valid values: compute_50, compute_52, compute_53, compute_60, compute_61, compute_62, compute_70, compute_72, + compute_75, compute_80, compute_87, compute_89, compute_90, compute_90a, sm_50, sm_52, sm_53, sm_60, sm_61, + sm_62, sm_70, sm_72, sm_75, sm_80, sm_87, sm_89, sm_90, sm_90a. + Default: compute_52 + Maps to: --gpu-architecture= (-arch) + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. + Default: False + Maps to: --relocatable-device-code={true|false} (-rdc) + extensible_whole_program : bool, optional + Do extensible whole program compilation of device code. + Default: False + Maps to: --extensible-whole-program (-ewp) + device_debug : bool, optional + Generate debug information. If --dopt is not specified, then turns off all optimizations. + Default: False + Maps to: --device-debug (-G) + generate_line_info : bool, optional + Generate line-number information. + Default: False + Maps to: --generate-line-info (-lineinfo) + device_optimize : bool, optional + Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation + for optimized device code. + Default: None + Maps to: --dopt on (-dopt) + ptxas_options : Union[str, List[str]], optional + Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. + For example ["-v", "-O2"]. + Default: None + Maps to: --ptxas-options (-Xptxas) + maxrregcount : int, optional + Specify the maximum amount of registers that GPU functions can use. 
+ Default: None + Maps to: --maxrregcount= (-maxrregcount) + ftz : bool, optional + When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal + values. + Default: False + Maps to: --ftz={true|false} (-ftz) + prec_sqrt : bool, optional + For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. + Default: True + Maps to: --prec-sqrt={true|false} (-prec-sqrt) + prec_div : bool, optional + For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster + approximation. + Default: True + Maps to: --prec-div={true|false} (-prec-div) + fmad : bool, optional + Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point + multiply-add operations. + Default: True + Maps to: --fmad={true|false} (-fmad) + use_fast_math : bool, optional + Make use of fast math operations. + Default: False + Maps to: --use_fast_math (-use_fast_math) + extra_device_vectorization : bool, optional + Enables more aggressive device code vectorization in the NVVM optimizer. + Default: False + Maps to: --extra-device-vectorization (-extra-device-vectorization) + dlink_time_opt : bool, optional + Generate intermediate code for later link-time optimization. + Default: False + Maps to: --dlink-time-opt (-dlto) + gen_opt_lto : bool, optional + Run the optimizer passes before generating the LTO IR. + Default: False + Maps to: --gen-opt-lto (-gen-opt-lto) + define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional + Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of + strings, in which case the first element is defined as the second, or a list of strings or tuples. + Default: None + Maps to: --define-macro= (-D) + undefine_macro : Union[str, List[str]], optional + Cancel any previous definition of a macro, or list of macros. 
+ Default: None + Maps to: --undefine-macro= (-U) + include_path : Union[str, List[str]], optional + Add the directory or directories to the list of directories to be searched for headers. + Default: None + Maps to: --include-path= (-I) + pre_include : Union[str, List[str]], optional + Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. + Default: None + Maps to: --pre-include=
(-include) + no_source_include : bool, optional + Disable the default behavior of adding the directory of each input source to the include path. + Default: False + Maps to: --no-source-include (-no-source-include) + std : str, optional + Set language dialect to C++03, C++11, C++14, C++17 or C++20. + Default: c++17 + Maps to: --std={c++03|c++11|c++14|c++17|c++20} (-std) + builtin_move_forward : bool, optional + Provide builtin definitions of std::move and std::forward. + Default: True + Maps to: --builtin-move-forward={true|false} (-builtin-move-forward) + builtin_initializer_list : bool, optional + Provide builtin definitions of std::initializer_list class and member functions. + Default: True + Maps to: --builtin-initializer-list={true|false} (-builtin-initializer-list) + disable_warnings : bool, optional + Inhibit all warning messages. + Default: False + Maps to: --disable-warnings (-w) + restrict : bool, optional + Programmer assertion that all kernel pointer parameters are restrict pointers. + Default: False + Maps to: --restrict (-restrict) + device_as_default_execution_space : bool, optional + Treat entities with no execution space annotation as __device__ entities. + Default: False + Maps to: --device-as-default-execution-space (-default-device) + device_int128 : bool, optional + Allow the __int128 type in device code. + Default: False + Maps to: --device-int128 (-device-int128) + optimization_info : str, optional + Provide optimization reports for the specified kind of optimization. + Default: None + Maps to: --optimization-info= (-opt-info) + no_display_error_number : bool, optional + Disable the display of a diagnostic number for warning messages. + Default: False + Maps to: --no-display-error-number (-no-err-no) + diag_error : str, optional + Emit error for specified diagnostic message number(s). + Default: None + Maps to: --diag-error=,… (-diag-error) + diag_suppress : str, optional + Suppress specified diagnostic message number(s). 
+ Default: None + Maps to: --diag-suppress=,… (-diag-suppress) + diag_warn : str, optional + Emit warning for specified diagnostic message number(s). + Default: None + Maps to: --diag-warn=,… (-diag-warn) + brief_diagnostics : bool, optional + Disable or enable showing source line and column info in a diagnostic. + Default: False + Maps to: --brief-diagnostics={true|false} (-brief-diag) + time : str, optional + Generate a CSV table with the time taken by each compilation phase. + Default: None + Maps to: --time= (-time) + split_compile : int, optional + Perform compiler optimizations in parallel. + Default: 1 + Maps to: --split-compile= (-split-compile) + fdevice_syntax_only : bool, optional + Ends device compilation after front-end syntax checking. + Default: False + Maps to: --fdevice-syntax-only (-fdevice-syntax-only) + minimal : bool, optional + Omit certain language features to reduce compile time for small programs. + Default: False + Maps to: --minimal (-minimal) + """ + + gpu_architecture: Optional[str] = None + device_c: Optional[bool] = None + device_w: Optional[bool] = None + relocatable_device_code: Optional[bool] = None + extensible_whole_program: Optional[bool] = None + device_debug: Optional[bool] = None + generate_line_info: Optional[bool] = None + device_optimize: Optional[bool] = None + ptxas_options: Optional[Union[str, List[str]]] = None + maxrregcount: Optional[int] = None + ftz: Optional[bool] = None + prec_sqrt: Optional[bool] = None + prec_div: Optional[bool] = None + fmad: Optional[bool] = None + use_fast_math: Optional[bool] = None + extra_device_vectorization: Optional[bool] = None + dlink_time_opt: Optional[bool] = None + gen_opt_lto: Optional[bool] = None + define_macro: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None + undefine_macro: Optional[Union[str, List[str]]] = None + include_path: Optional[Union[str, List[str]]] = None + pre_include: Optional[Union[str, List[str]]] = None + no_source_include: 
 Optional[bool] = None + std: Optional[str] = None + builtin_move_forward: Optional[bool] = None + builtin_initializer_list: Optional[bool] = None + disable_warnings: Optional[bool] = None + restrict: Optional[bool] = None + device_as_default_execution_space: Optional[bool] = None + device_int128: Optional[bool] = None + optimization_info: Optional[str] = None + no_display_error_number: Optional[bool] = None + diag_error: Optional[str] = None + diag_suppress: Optional[str] = None + diag_warn: Optional[str] = None + brief_diagnostics: Optional[bool] = None + time: Optional[str] = None + split_compile: Optional[int] = None + fdevice_syntax_only: Optional[bool] = None + minimal: Optional[bool] = None + + def __post_init__(self): + self._formatted_options = [] + if self.gpu_architecture is not None: + self._formatted_options.append(f"--gpu-architecture={self.gpu_architecture}") + else: + self._formatted_options.append( + "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) + ) + if self.relocatable_device_code is not None: + self._formatted_options.append( + f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}" + ) + if self.extensible_whole_program is not None and self.extensible_whole_program: + self._formatted_options.append("--extensible-whole-program") + if self.device_debug is not None and self.device_debug: + self._formatted_options.append("--device-debug") + if self.generate_line_info is not None and self.generate_line_info: + self._formatted_options.append("--generate-line-info") + if self.device_optimize is not None: + self._formatted_options.append(f"--dopt={'on' if self.device_optimize else 'off'}") + if self.ptxas_options is not None: + self._formatted_options.append("--ptxas-options") + if isinstance(self.ptxas_options, list): + for option in self.ptxas_options: + self._formatted_options.append(option) + else: + self._formatted_options.append(self.ptxas_options) + if self.maxrregcount is not None: 
+ self._formatted_options.append(f"--maxrregcount={self.maxrregcount}") + if self.ftz is not None: + self._formatted_options.append(f"--ftz={_handle_boolean_option(self.ftz)}") + if self.prec_sqrt is not None: + self._formatted_options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") + if self.prec_div is not None: + self._formatted_options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") + if self.fmad is not None: + self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fmad)}") + if self.use_fast_math is not None and self.use_fast_math: + self._formatted_options.append("--use_fast_math") + if self.extra_device_vectorization is not None and self.extra_device_vectorization: + self._formatted_options.append("--extra-device-vectorization") + if self.dlink_time_opt is not None and self.dlink_time_opt: + self._formatted_options.append("--dlink-time-opt") + if self.gen_opt_lto is not None and self.gen_opt_lto: + self._formatted_options.append("--gen-opt-lto") + if self.define_macro is not None: + if isinstance(self.define_macro, list): + for macro in self.define_macro: + if isinstance(macro, tuple): + assert len(macro) == 2 + self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") + else: + self._formatted_options.append(f"--define-macro={macro}") + elif isinstance(self.define_macro, tuple): + assert len(self.define_macro) == 2 + self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + else: + self._formatted_options.append(f"--define-macro={self.define_macro}") + + if self.undefine_macro is not None: + if isinstance(self.undefine_macro, list): + for macro in self.undefine_macro: + self._formatted_options.append(f"--undefine-macro={macro}") + else: + self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") + if self.include_path is not None: + if isinstance(self.include_path, list): + for path in self.include_path: + 
self._formatted_options.append(f"--include-path={path}") + else: + self._formatted_options.append(f"--include-path={self.include_path}") + if self.pre_include is not None: + if isinstance(self.pre_include, list): + for header in self.pre_include: + self._formatted_options.append(f"--pre-include={header}") + else: + self._formatted_options.append(f"--pre-include={self.pre_include}") + if self.no_source_include is not None and self.no_source_include: + self._formatted_options.append("--no-source-include") + if self.std is not None: + self._formatted_options.append(f"--std={self.std}") + if self.builtin_move_forward is not None: + self._formatted_options.append( + f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}" + ) + if self.builtin_initializer_list is not None: + self._formatted_options.append( + f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}" + ) + if self.disable_warnings is not None and self.disable_warnings: + self._formatted_options.append("--disable-warnings") + if self.restrict is not None and self.restrict: + self._formatted_options.append("--restrict") + if self.device_as_default_execution_space is not None and self.device_as_default_execution_space: + self._formatted_options.append("--device-as-default-execution-space") + if self.device_int128 is not None and self.device_int128: + self._formatted_options.append("--device-int128") + if self.optimization_info is not None: + self._formatted_options.append(f"--optimization-info={self.optimization_info}") + if self.no_display_error_number is not None and self.no_display_error_number: + self._formatted_options.append("--no-display-error-number") + if self.diag_error is not None: + self._formatted_options.append(f"--diag-error={self.diag_error}") + if self.diag_suppress is not None: + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + if self.diag_warn is not None: + 
self._formatted_options.append(f"--diag-warn={self.diag_warn}") + if self.brief_diagnostics is not None: + self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") + if self.time is not None: + self._formatted_options.append(f"--time={self.time}") + if self.split_compile is not None: + self._formatted_options.append(f"--split-compile={self.split_compile}") + if self.fdevice_syntax_only is not None and self.fdevice_syntax_only: + self._formatted_options.append("--fdevice-syntax-only") + if self.minimal is not None and self.minimal: + self._formatted_options.append("--minimal") + + def _as_bytes(self): + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + result = [] + for option in self._formatted_options: + result.append(option.encode()) + return result + + def __repr__(self): + # __TODO__ improve this + return self._formatted_options class Program: @@ -21,14 +365,16 @@ class Program: String of the CUDA Runtime Compilation program. code_type : Any String of the code type. Currently only ``"c++"`` is supported. - + options : ProgramOptions, optional + A ProgramOptions object to customize the compilation process. + See :obj:`ProgramOptions` for more information. 
""" - __slots__ = ("_handle", "_backend") + __slots__ = ("_handle", "_backend", "_options") _supported_code_type = ("c++",) _supported_target_type = ("ptx", "cubin", "ltoir") - def __init__(self, code, code_type): + def __init__(self, code, code_type, options: ProgramOptions = None): self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError @@ -37,12 +383,16 @@ def __init__(self, code, code_type): if not isinstance(code, str): raise TypeError # TODO: support pre-loaded headers & include names - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved self._handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" else: raise NotImplementedError + if options is None: + self._options = [] + else: + self._options = options._as_bytes() + def __del__(self): """Return close(self).""" self.close() @@ -53,7 +403,7 @@ def close(self): handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None - def compile(self, target_type, options=(), name_expressions=(), logs=None): + def compile(self, target_type, name_expressions=(), logs=None): """Compile the program with a specific compilation type. Parameters @@ -61,9 +411,6 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): target_type : Any String of the targeted compilation type. Supported options are "ptx", "cubin" and "ltoir". - options : Union[List, Tuple], optional - List of compilation options associated with the backend - of this :obj:`Program`. (Default to no options) name_expressions : Union[List, Tuple], optional List of explicit name expressions to become accessible. 
(Default to no expressions) @@ -85,9 +432,9 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle) - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - options = list(o.encode() for o in options) - handle_return(nvrtc.nvrtcCompileProgram(self._handle, len(options), options), handle=self._handle) + handle_return( + nvrtc.nvrtcCompileProgram(self._handle, len(self._options), self._options), handle=self._handle + ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 6a68d175..5756dd34 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -211,9 +211,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): try: stream = Stream._init(event_or_stream) except Exception as e: - raise ValueError( - "only an Event, Stream, or object supporting __cuda_stream__ can be waited" - ) from e + raise ValueError("only an Event, Stream, or object supporting __cuda_stream__ can be waited") from e else: stream = event_or_stream event = handle_return(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 9cb47a33..74488a53 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -87,6 +87,13 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals return options +def _handle_boolean_option(option: bool) -> str: + """ + Convert a boolean option to a string representation. 
+ """ + return str(option).lower() + + def precondition(checker: Callable[..., None], what: str = "") -> Callable: """ A decorator that adds checks to ensure any preconditions are met. diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..914e9c17 --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,12 @@ +# `cuda.core` Release notes + +Released on , 2024 + +## Hightlights +- Add ProgramOptions to facilitate the passing of runtime compile options to [Program](#program) + +## Limitations +- + +## Breaking Changes +- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 8caa4d4a..c6b4be85 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -6,7 +6,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ @@ -29,13 +29,10 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") +program_options = ProgramOptions(std="c++11", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", - options=( - "-std=c++11", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), logs=sys.stdout, name_expressions=("saxpy", "saxpy"), ) diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 550eaf2a..60b9b0db 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -4,7 +4,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from 
cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ @@ -26,15 +26,9 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") -mod = prog.compile( - "cubin", - options=( - "-std=c++17", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), - name_expressions=("vector_add",), -) +program_options = ProgramOptions(std="c++17", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +prog = Program(code, code_type="c++", options=program_options) +mod = prog.compile("cubin", name_expressions=("vector_add",)) # run in single precision ker = mod.get_kernel("vector_add") diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index af94a7ba..e6ae3334 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -8,8 +8,45 @@ import pytest -from cuda.core.experimental import Program from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions + + +def test_program_with_various_options(init_cuda): + code = 'extern "C" __global__ void my_kernel() {}' + + options_list = [ + ProgramOptions(ptxas_options="-v"), + ProgramOptions(ptxas_options=["-v", "-O3"]), + ProgramOptions(device_optimize=True, device_debug=True), + ProgramOptions(relocatable_device_code=True, maxrregcount=32), + ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), + ProgramOptions(fmad=False, use_fast_math=True), + ProgramOptions(extra_device_vectorization=True), + ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), + ProgramOptions(define_macro="MY_MACRO"), + ProgramOptions(define_macro=("MY_MACRO", "99")), + ProgramOptions(define_macro=[("MY_MACRO", "99")]), + ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), + ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), + ProgramOptions(undefine_macro="MY_MACRO", 
include_path="/usr/local/include"), + ProgramOptions(pre_include="my_header.h", no_source_include=True), + ProgramOptions(builtin_initializer_list=False, disable_warnings=True), + ProgramOptions(restrict=True, device_as_default_execution_space=True), + ProgramOptions(device_int128=True, optimization_info="inline"), + ProgramOptions(no_display_error_number=True), + ProgramOptions(diag_error="1234", diag_suppress="5678"), + ProgramOptions(diag_warn="91011", brief_diagnostics=True), + ProgramOptions(time="compile_time.csv", split_compile=4), + ProgramOptions(fdevice_syntax_only=True, minimal=True), + ] + + # TODO compile the program once the CI is set up + for options in options_list: + program = Program(code, "c++", options) + assert program.backend == "nvrtc" + program.close() + assert program.handle is None def test_program_init_valid_code_type(): From 66ceb8508100fbbbe2b7ab3c547e895127a6c335 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:29:32 -0800 Subject: [PATCH 02/44] remove stream from commit --- cuda_core/cuda/core/experimental/_stream.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 5756dd34..6a68d175 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -211,7 +211,9 @@ def wait(self, event_or_stream: Union[Event, Stream]): try: stream = Stream._init(event_or_stream) except Exception as e: - raise ValueError("only an Event, Stream, or object supporting __cuda_stream__ can be waited") from e + raise ValueError( + "only an Event, Stream, or object supporting __cuda_stream__ can be waited" + ) from e else: stream = event_or_stream event = handle_return(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) From 37a945c92b99ffd0d2311ea9fc8c9c6b0f947972 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:42:54 -0800 Subject: [PATCH 03/44] modify 
doc source --- cuda_core/docs/source/api.rst | 4 ++++ cuda_core/docs/source/release.md | 1 + 2 files changed, 5 insertions(+) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..32a4ac2d 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,7 @@ CUDA compilation toolchain :toctree: generated/ Program + + :template: dataclass.rst + + ProgramOptions \ No newline at end of file diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..4c615eb3 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,5 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 ``` From 3555e2eed747b2f402219b9a74d2b8fa09ccfc03 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:44:15 -0800 Subject: [PATCH 04/44] modify doc source --- cuda_core/docs/source/release/0.2.0-notes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md index 914e9c17..3ba043ce 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -3,7 +3,7 @@ Released on , 2024 ## Hightlights -- Add ProgramOptions to facilitate the passing of runtime compile options to [Program](#program) +- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` ## Limitations - From a9ac448e4ac6bab862498eb35763bbc9ddf2e1d6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 18 Dec 2024 07:40:29 -0800 Subject: [PATCH 05/44] integrate program options into the tests --- cuda_core/cuda/core/experimental/_program.py | 7 +- cuda_core/examples/strided_memory_view.py | 11 +- cuda_core/examples/thread_block_cluster.py | 12 +- cuda_core/tests/test_linker.py | 14 +- cuda_core/tests/test_program.py | 209 ++++++++++--------- 5 files changed, 127 insertions(+), 126 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 5db6f6ba..feb282df 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import weakref from dataclasses import dataclass from typing import List, Optional, Tuple, Union -import weakref from cuda import nvrtc from cuda.core.experimental._device import Device @@ -440,9 +440,10 @@ def compile(self, target_type, name_expressions=(), logs=None): if self._backend == "nvrtc": if name_expressions: for n in name_expressions: - handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle) + handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) handle_return( - nvrtc.nvrtcCompileProgram(self._handle, len(self._options), self._options), handle=self._handle + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options), + handle=self._mnff.handle, ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 564d7fa0..33ae26e1 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -31,7 +31,7 @@ cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ @@ -88,15 +88,12 @@ } } """).substitute(func_sig=func_sig) - gpu_prog = Program(gpu_code, code_type="c++") + # To know the GPU's compute capability, we need to identify which GPU to use. 
dev = Device(0) arch = "".join(f"{i}" for i in dev.compute_capability) - mod = gpu_prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++11"), - ) + gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++11")) + mod = gpu_prog.compile(target_type="cubin") gpu_ker = mod.get_kernel(func_name) # Now we are prepared to run the code from the user's perspective! diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index fa70738d..7b0182cd 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -5,7 +5,7 @@ import os import sys -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # prepare include cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) @@ -44,12 +44,12 @@ # prepare program & compile kernel dev.set_current() -prog = Program(code, code_type="c++") -mod = prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"), +prog = Program( + code, + code_type="c++", + options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), ) +mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") # prepare launch config diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 54cd8cf4..bc1fb109 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,7 +4,7 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program, _linker +from cuda.core.experimental import Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode 
ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper @@ -24,18 +24,18 @@ def compile_ptx_functions(init_cuda): # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail - object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) - object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) - object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) + object_code_a_ptx = Program(kernel_a, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") + object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") + object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index e6ae3334..5d6376f0 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -1,103 +1,106 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. 
-# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions - - -def test_program_with_various_options(init_cuda): - code = 'extern "C" __global__ void my_kernel() {}' - - options_list = [ - ProgramOptions(ptxas_options="-v"), - ProgramOptions(ptxas_options=["-v", "-O3"]), - ProgramOptions(device_optimize=True, device_debug=True), - ProgramOptions(relocatable_device_code=True, maxrregcount=32), - ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), - ProgramOptions(fmad=False, use_fast_math=True), - ProgramOptions(extra_device_vectorization=True), - ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), - ProgramOptions(define_macro="MY_MACRO"), - ProgramOptions(define_macro=("MY_MACRO", "99")), - ProgramOptions(define_macro=[("MY_MACRO", "99")]), - ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), - ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), - ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), - ProgramOptions(pre_include="my_header.h", no_source_include=True), - ProgramOptions(builtin_initializer_list=False, disable_warnings=True), - ProgramOptions(restrict=True, device_as_default_execution_space=True), - ProgramOptions(device_int128=True, optimization_info="inline"), - ProgramOptions(no_display_error_number=True), - ProgramOptions(diag_error="1234", diag_suppress="5678"), - ProgramOptions(diag_warn="91011", brief_diagnostics=True), - ProgramOptions(time="compile_time.csv", split_compile=4), - ProgramOptions(fdevice_syntax_only=True, minimal=True), - ] - - # TODO compile the 
program once the CI is set up - for options in options_list: - program = Program(code, "c++", options) - assert program.backend == "nvrtc" - program.close() - assert program.handle is None - - -def test_program_init_valid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - assert program.handle is not None - - -def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") - - -def test_program_init_invalid_code_format(): - code = 12345 - with pytest.raises(TypeError): - Program(code, "c++") - - -def test_program_compile_valid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) - - -def test_program_compile_invalid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - with pytest.raises(NotImplementedError): - program.compile("invalid_target") - - -def test_program_backend_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - - -def test_program_handle_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.handle is not None - - -def test_program_close(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - program.close() - assert program.handle is None +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. 
Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest +from conftest import can_load_generated_ptx + +from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions + + +def test_program_with_various_options(init_cuda): + code = 'extern "C" __global__ void my_kernel() {}' + + options_list = [ + ProgramOptions(ptxas_options="-v"), + ProgramOptions(ptxas_options=["-v", "-O3"]), + ProgramOptions(device_optimize=True, device_debug=True), + ProgramOptions(relocatable_device_code=True, maxrregcount=32), + ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), + ProgramOptions(fmad=False, use_fast_math=True), + ProgramOptions(extra_device_vectorization=True), + ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), + ProgramOptions(define_macro="MY_MACRO"), + ProgramOptions(define_macro=("MY_MACRO", "99")), + ProgramOptions(define_macro=[("MY_MACRO", "99")]), + ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), + ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), + ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), + ProgramOptions(pre_include="my_header.h", no_source_include=True), + ProgramOptions(builtin_initializer_list=False, disable_warnings=True), + ProgramOptions(restrict=True, device_as_default_execution_space=True), + ProgramOptions(device_int128=True, optimization_info="inline"), + ProgramOptions(no_display_error_number=True), + ProgramOptions(diag_error="1234", diag_suppress="5678"), + ProgramOptions(diag_warn="91011", brief_diagnostics=True), + ProgramOptions(time="compile_time.csv", split_compile=4), + ProgramOptions(fdevice_syntax_only=True, minimal=True), + ] + + # TODO compile the program once the CI is set up + for options in options_list: + program = Program(code, "c++", options) + assert 
program.backend == "nvrtc" + program.close() + assert program.handle is None + + +def test_program_init_valid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + assert program.handle is not None + + +def test_program_init_invalid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + with pytest.raises(NotImplementedError): + Program(code, "python") + + +def test_program_init_invalid_code_format(): + code = 12345 + with pytest.raises(TypeError): + Program(code, "c++") + + +# TODO: incorporate this check in Program +@pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") +def test_program_compile_valid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + object_code = program.compile("ptx") + kernel = object_code.get_kernel("my_kernel") + assert isinstance(object_code, ObjectCode) + assert isinstance(kernel, Kernel) + + +def test_program_compile_invalid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + with pytest.raises(NotImplementedError): + program.compile("invalid_target") + + +def test_program_backend_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + + +def test_program_handle_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.handle is not None + + +def test_program_close(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + program.close() + assert program.handle is None From 6790c83b39bbb7702cd032ae20b01491c02733a3 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Sun, 29 Dec 2024 22:04:11 -0800 Subject: [PATCH 06/44] update the attribute names for consistency across linker and program --- cuda_core/cuda/core/experimental/_program.py | 44 
++++++++++---------- cuda_core/tests/test_linker.py | 10 +++-- cuda_core/tests/test_program.py | 8 ++-- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index feb282df..08569f33 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -18,7 +18,7 @@ class ProgramOptions: Attributes ---------- - gpu_architecture : str, optional + arch : str, optional Specify the name of the class of GPU architectures for which the input must be compiled. Valid values: compute_50, compute_52, compute_53, compute_60, compute_61, compute_62, compute_70, compute_72, compute_75, compute_80, compute_87, compute_89, compute_90, compute_90a, sm_50, sm_52, sm_53, sm_60, sm_61, @@ -33,15 +33,15 @@ class ProgramOptions: Do extensible whole program compilation of device code. Default: False Maps to: --extensible-whole-program (-ewp) - device_debug : bool, optional + debug : bool, optional Generate debug information. If --dopt is not specified, then turns off all optimizations. Default: False Maps to: --device-debug (-G) - generate_line_info : bool, optional + lineinfo: bool, optional Generate line-number information. Default: False Maps to: --generate-line-info (-lineinfo) - device_optimize : bool, optional + device_code_optimize : bool, optional Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation for optimized device code. Default: None @@ -51,7 +51,7 @@ class ProgramOptions: For example ["-v", "-O2"]. Default: None Maps to: --ptxas-options (-Xptxas) - maxrregcount : int, optional + max_register_count : int, optional Specify the maximum amount of registers that GPU functions can use. Default: None Maps to: --maxrregcount= (-maxrregcount) @@ -69,7 +69,7 @@ class ProgramOptions: approximation. 
Default: True Maps to: --prec-div={true|false} (-prec-div) - fmad : bool, optional + fma : bool, optional Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations. Default: True @@ -82,7 +82,7 @@ class ProgramOptions: Enables more aggressive device code vectorization in the NVVM optimizer. Default: False Maps to: --extra-device-vectorization (-extra-device-vectorization) - dlink_time_opt : bool, optional + link_time_optimization : bool, optional Generate intermediate code for later link-time optimization. Default: False Maps to: --dlink-time-opt (-dlto) @@ -186,18 +186,18 @@ class ProgramOptions: device_w: Optional[bool] = None relocatable_device_code: Optional[bool] = None extensible_whole_program: Optional[bool] = None - device_debug: Optional[bool] = None - generate_line_info: Optional[bool] = None - device_optimize: Optional[bool] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + device_code_optimize: Optional[bool] = None ptxas_options: Optional[Union[str, List[str]]] = None - maxrregcount: Optional[int] = None + max_register_count: Optional[int] = None ftz: Optional[bool] = None prec_sqrt: Optional[bool] = None prec_div: Optional[bool] = None - fmad: Optional[bool] = None + fma: Optional[bool] = None use_fast_math: Optional[bool] = None extra_device_vectorization: Optional[bool] = None - dlink_time_opt: Optional[bool] = None + link_time_optimization: Optional[bool] = None gen_opt_lto: Optional[bool] = None define_macro: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None undefine_macro: Optional[Union[str, List[str]]] = None @@ -236,12 +236,12 @@ def __post_init__(self): ) if self.extensible_whole_program is not None and self.extensible_whole_program: self._formatted_options.append("--extensible-whole-program") - if self.device_debug is not None and self.device_debug: + if self.debug is not None and self.debug: 
self._formatted_options.append("--device-debug") - if self.generate_line_info is not None and self.generate_line_info: + if self.lineinfo is not None and self.lineinfo: self._formatted_options.append("--generate-line-info") - if self.device_optimize is not None: - self._formatted_options.append(f"--dopt={'on' if self.device_optimize else 'off'}") + if self.device_code_optimize is not None: + self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not None: self._formatted_options.append("--ptxas-options") if isinstance(self.ptxas_options, list): @@ -249,21 +249,21 @@ def __post_init__(self): self._formatted_options.append(option) else: self._formatted_options.append("self.ptxas_options") - if self.maxrregcount is not None: - self._formatted_options.append(f"--maxrregcount={self.maxrregcount}") + if self.max_register_count is not None: + self._formatted_options.append(f"--maxrregcount={self.max_register_count}") if self.ftz is not None: self._formatted_options.append(f"--ftz={_handle_boolean_option(self.ftz)}") if self.prec_sqrt is not None: self._formatted_options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") if self.prec_div is not None: self._formatted_options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") - if self.fmad is not None: - self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fmad)}") + if self.fma is not None: + self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fma)}") if self.use_fast_math is not None and self.use_fast_math: self._formatted_options.append("--use_fast_math") if self.extra_device_vectorization is not None and self.extra_device_vectorization: self._formatted_options.append("--extra-device-vectorization") - if self.dlink_time_opt is not None and self.dlink_time_opt: + if self.link_time_optimization is not None and self.link_time_optimization: self._formatted_options.append("--dlink-time-opt") if self.gen_opt_lto 
is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index bc1fb109..83ce5208 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -33,9 +33,13 @@ def compile_ptx_functions(init_cuda): @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") - object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") - object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") + object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(link_time_optimization=True)).compile("ltoir") + object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) + object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 5d6376f0..fd2515b9 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -19,12 +19,12 @@ def test_program_with_various_options(init_cuda): options_list = [ ProgramOptions(ptxas_options="-v"), ProgramOptions(ptxas_options=["-v", "-O3"]), - ProgramOptions(device_optimize=True, device_debug=True), - ProgramOptions(relocatable_device_code=True, maxrregcount=32), + ProgramOptions(device_code_optimize=True, debug=True), + ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), - ProgramOptions(fmad=False, use_fast_math=True), + ProgramOptions(fma=False, use_fast_math=True), ProgramOptions(extra_device_vectorization=True), - 
ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), + ProgramOptions(link_time_optimization=True, gen_opt_lto=True), ProgramOptions(define_macro="MY_MACRO"), ProgramOptions(define_macro=("MY_MACRO", "99")), ProgramOptions(define_macro=[("MY_MACRO", "99")]), From 7d5b894fda1066788ab9ae50486cb59cd1f5d5e5 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 30 Dec 2024 16:11:20 -0800 Subject: [PATCH 07/44] fix module test --- cuda_core/tests/test_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 61933d41..7db017f1 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -10,13 +10,13 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Program +from cuda.core.experimental import Program, ProgramOptions @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_get_kernel(): kernel = """extern "C" __global__ void ABC() { }""" - object_code = Program(kernel, "c++").compile("ptx", options=("-rdc=true",)) + object_code = Program(kernel, "c++", options=ProgramOptions(relocatable_device_code=True)).compile("ptx") assert object_code._handle is None kernel = object_code.get_kernel("ABC") assert object_code._handle is not None From 1789a849500416020da439d358340e58f2ea3107 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 30 Dec 2024 17:35:59 -0800 Subject: [PATCH 08/44] update the tests --- cuda_core/cuda/core/experimental/_linker.py | 11 +++++---- cuda_core/cuda/core/experimental/_program.py | 6 ++--- cuda_core/docs/source/release/0.2.0-notes.md | 24 ++++++++++---------- cuda_core/examples/saxpy.py | 2 +- cuda_core/examples/strided_memory_view.py | 2 +- cuda_core/examples/thread_block_cluster.py | 2 +- cuda_core/examples/vector_add.py | 2 +- cuda_core/tests/test_linker.py | 14 +++++++----- 8 files changed, 34 insertions(+), 29 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 2beeb168..570a6b04 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -9,6 +9,7 @@ from typing import List, Optional from cuda import cuda +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import check_or_create_options, handle_return @@ -84,10 +85,10 @@ class LinkerOptions: Attributes ---------- - arch : str + arch : str, optional Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or - ``compute_`` (for generating PTX). - This is a required option. + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. max_register_count : int, optional Maximum register count. Maps to: ``-maxrregcount=``. @@ -165,7 +166,7 @@ class LinkerOptions: Default: False. """ - arch: str + arch: Optional[str] = None max_register_count: Optional[int] = None time: Optional[bool] = None verbose: Optional[bool] = None @@ -197,6 +198,8 @@ def __post_init__(self): def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") + else: + self.formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability)) if self.max_register_count is not None: self.formatted_options.append(f"-maxrregcount={self.max_register_count}") if self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 08569f33..53cc6a22 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -181,7 +181,7 @@ class ProgramOptions: Maps to: --minimal (-minimal) """ - gpu_architecture: Optional[str] = None + arch: Optional[str] = None device_c: Optional[bool] = None device_w: Optional[bool] = None relocatable_device_code: 
Optional[bool] = None @@ -224,8 +224,8 @@ class ProgramOptions: def __post_init__(self): self._formatted_options = [] - if self.gpu_architecture is not None: - self._formatted_options.append(f"--gpu-architecture={self.gpu_architecture}") + if self.arch is not None: + self._formatted_options.append(f"--gpu-architecture={self.arch}") else: self._formatted_options.append( "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) ) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md index 3ba043ce..a2091d4d 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -1,12 +1,12 @@ -# `cuda.core` Release notes - -Released on , 2024 - -## Hightlights -- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` - -## Limitations -- - -## Breaking Changes -- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. +# `cuda.core` Release notes + +Released on , 2024 + +## Highlights +- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` + +## Limitations +- + +## Breaking Changes +- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. 
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 56db78da..ad8de57d 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -29,7 +29,7 @@ s = dev.create_stream() # prepare program -program_options = ProgramOptions(std="c++11", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +program_options = ProgramOptions(std="c++11", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 12b917ae..ed970d50 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -93,7 +93,7 @@ dev = Device(0) dev.set_current() arch = "".join(f"{i}" for i in dev.compute_capability) - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++11")) + gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11")) mod = gpu_prog.compile(target_type="cubin") gpu_ker = mod.get_kernel(func_name) diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index 7b0182cd..f4333844 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -47,7 +47,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), ) mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index b12db84d..5a42ab7b 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -26,7 +26,7 @@ s = dev.create_stream() # prepare 
program -program_options = ProgramOptions(std="c++17", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +program_options = ProgramOptions(std="c++17", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) prog = Program(code, code_type="c++", options=program_options) mod = prog.compile("cubin", name_expressions=("vector_add",)) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 83ce5208..4d1f5731 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,11 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program, ProgramOptions, _linker +from cuda.bindings import nvjitlink +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) kernel_a = """ extern __device__ int B(); @@ -45,6 +46,7 @@ def compile_ltoir_functions(init_cuda): culink_options = [ + LinkerOptions(), LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, optimization_level=3), @@ -81,10 +83,10 @@ def test_linker_init(compile_ptx_functions, options): assert isinstance(object_code, ObjectCode) -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) +def test_linker_init_invalid_arch(compile_ptx_functions): + options = LinkerOptions(arch="99", ptx=True) + with pytest.raises(nvjitlink.nvJitLinkError): + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") From 2285facdaff03b13b7e3d6ba47d601289dd381ab Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 1 Jan 2025 13:52:11 -0800 Subject: [PATCH 09/44] update the tests --- cuda_core/tests/test_linker.py | 
8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 4d1f5731..f4e71c43 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -85,8 +85,12 @@ def test_linker_init(compile_ptx_functions, options): def test_linker_init_invalid_arch(compile_ptx_functions): options = LinkerOptions(arch="99", ptx=True) - with pytest.raises(nvjitlink.nvJitLinkError): - Linker(*compile_ptx_functions, options=options) + if culink_backend: + with pytest.raises(AttributeError): + Linker(*compile_ptx_functions, options=options) + else: + with pytest.raises(nvjitlink.nvJitLinkError): + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") From bb6204838b5ccb4f0ffccad4a86effb4c7c3d10b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 2 Jan 2025 11:39:39 -0800 Subject: [PATCH 10/44] move ProgramOptions ctor into pytest raises --- cuda_core/tests/test_linker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index f4e71c43..9a318651 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -52,7 +52,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), + LinkerOptions(arch=ARCH, no_cache=True), # TODO: consider adding cuda 12.4 to test matrix in which case this + # will fail. 
Tracked in issue #337 ] @@ -78,18 +79,20 @@ def compile_ltoir_functions(init_cuda): ], ) def test_linker_init(compile_ptx_functions, options): + print(culink_backend) linker = Linker(*compile_ptx_functions, options=options) object_code = linker.link("cubin") assert isinstance(object_code, ObjectCode) def test_linker_init_invalid_arch(compile_ptx_functions): - options = LinkerOptions(arch="99", ptx=True) if culink_backend: with pytest.raises(AttributeError): + options = LinkerOptions(arch="99", ptx=True) Linker(*compile_ptx_functions, options=options) else: with pytest.raises(nvjitlink.nvJitLinkError): + options = LinkerOptions(arch="99", ptx=True) Linker(*compile_ptx_functions, options=options) From 0f0ca9b9477995631113df7aaec2918ee2a4a655 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 2 Jan 2025 12:00:59 -0800 Subject: [PATCH 11/44] only import nvjitlink if its available --- cuda_core/tests/test_linker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 9a318651..0073c4c9 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,7 +4,6 @@ import pytest -from cuda.bindings import nvjitlink from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode @@ -19,6 +18,8 @@ device_function_c = "__device__ int C(int a, int b) { return a + b; }" culink_backend = _linker._decide_nvjitlink_or_driver() +if not culink_backend: + from cuda.bindings import nvjitlink @pytest.fixture(scope="function") From 58f2b09f88b275c95009eeb692aa4bfac2886c24 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:53:51 -0800 Subject: [PATCH 12/44] Update cuda_core/examples/saxpy.py Co-authored-by: Leo Fang --- cuda_core/examples/saxpy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 
ad8de57d..8fc4b91b 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -29,7 +29,8 @@ s = dev.create_stream() # prepare program -program_options = ProgramOptions(std="c++11", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}") prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", From 7afe54e9d84650ec7f2055d356edb3c94eb04810 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:54:19 -0800 Subject: [PATCH 13/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 53cc6a22..c8de3f86 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -403,10 +403,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): else: raise NotImplementedError - if options is None: - self._options = [] - else: - self._options = options._as_bytes() + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") def close(self): """Destroy this program.""" From 653a3e14b4315e7d0e175d843b81ab5aadd79bf9 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:54:27 -0800 Subject: [PATCH 14/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index c8de3f86..034ffafa 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -439,7 +439,7 @@ def 
compile(self, target_type, name_expressions=(), logs=None): for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options), + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options._as_bytes()), handle=self._mnff.handle, ) From 5161a43cef5fa619d12ca49edac5bbd8d9ed1805 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:54:42 -0800 Subject: [PATCH 15/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 034ffafa..bdcff3ce 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -342,10 +342,7 @@ def __post_init__(self): def _as_bytes(self): # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - result = [] - for option in self._formatted_options: - result.append(option.encode()) - return result + return list(o.encode() for o in self._formatted_options) def __repr__(self): # __TODO__ improve this From f461eec29f15f1595d26781a9cab95c2d6051e7c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 3 Jan 2025 15:11:29 -0800 Subject: [PATCH 16/44] tweak doc source --- cuda_core/docs/source/index.rst | 2 +- cuda_core/docs/source/release.md | 11 ----------- cuda_core/docs/source/release.rst | 11 +++++++++++ .../release/{0.1.0-notes.md => 0.1.0-notes.rst} | 5 ++++- .../release/{0.1.1-notes.md => 0.1.1-notes.rst} | 5 ++++- .../release/{0.2.0-notes.md => 0.2.0-notes.rst} | 3 +++ 6 files changed, 23 insertions(+), 14 deletions(-) delete mode 100644 cuda_core/docs/source/release.md create mode 100644 cuda_core/docs/source/release.rst rename 
cuda_core/docs/source/release/{0.1.0-notes.md => 0.1.0-notes.rst} (86%) rename cuda_core/docs/source/release/{0.1.1-notes.md => 0.1.1-notes.rst} (95%) rename cuda_core/docs/source/release/{0.2.0-notes.md => 0.2.0-notes.rst} (85%) diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst index 19c14c2a..1aedfaa8 100644 --- a/cuda_core/docs/source/index.rst +++ b/cuda_core/docs/source/index.rst @@ -8,7 +8,7 @@ and other functionalities. :maxdepth: 2 :caption: Contents: - release.md + release.rst install.md interoperability.rst api.rst diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md deleted file mode 100644 index fa32175f..00000000 --- a/cuda_core/docs/source/release.md +++ /dev/null @@ -1,11 +0,0 @@ -# Release Notes - -```{toctree} ---- -maxdepth: 3 ---- - - 0.1.1 - 0.1.0 - 0.2.0 -``` diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst new file mode 100644 index 00000000..f2ae8ea0 --- /dev/null +++ b/cuda_core/docs/source/release.rst @@ -0,0 +1,11 @@ +# Release Notes + +Release Notes +============= + +.. 
toctree:: + :maxdepth: 3 + + release/0.2.0-notes + release/0.1.1-notes + release/0.1.0-notes diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.rst similarity index 86% rename from cuda_core/docs/source/release/0.1.0-notes.md rename to cuda_core/docs/source/release/0.1.0-notes.rst index 5d1c7fd5..83c4e56b 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,4 +1,7 @@ -# `cuda.core` v0.1.0 Release notes +# `cuda.core` Release notes + +`cuda.core` 0.1.0 Release Notes +=============================== Released on Nov 8, 2024 diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.rst similarity index 95% rename from cuda_core/docs/source/release/0.1.1-notes.md rename to cuda_core/docs/source/release/0.1.1-notes.rst index 9550cb23..8903a32a 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,4 +1,7 @@ -# `cuda.core` v0.1.1 Release notes +# `cuda.core` Release notes + +`cuda.core` 0.1.1 Release Notes +=============================== Released on Dec 20, 2024 diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.rst similarity index 85% rename from cuda_core/docs/source/release/0.2.0-notes.md rename to cuda_core/docs/source/release/0.2.0-notes.rst index a2091d4d..852875b2 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,5 +1,8 @@ # `cuda.core` Release notes +`cuda.core` 0.2.0 Release Notes +=============================== + Released on , 2024 ## Hightlights From 236db71429f9b70fe0f28984b229751124f878da Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 3 Jan 2025 15:28:51 -0800 Subject: [PATCH 17/44] tweak docs --- cuda_core/docs/source/release/0.1.0-notes.rst | 13 +++++----- cuda_core/docs/source/release/0.1.1-notes.rst | 25 
+++++++++++-------- cuda_core/docs/source/release/0.2.0-notes.rst | 20 +++++++++------ 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index 83c4e56b..fb1d8b77 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,20 +1,21 @@ -# `cuda.core` Release notes - `cuda.core` 0.1.0 Release Notes =============================== Released on Nov 8, 2024 -## Hightlights +Highlights +---------- + - Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities -## Limitations +Limitations +----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make `cuda.core` better! - Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) +- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) \ No newline at end of file diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 8903a32a..639a0559 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,11 +1,10 @@ -# `cuda.core` Release notes - `cuda.core` 0.1.1 Release Notes =============================== Released on Dec 20, 2024 -## Hightlights +Highlights +---------- - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. 
@@ -14,12 +13,14 @@ Released on Dec 20, 2024 detected in the current environment. - Support `pip install cuda-core`. Please see the Installation Guide for further details. -## New features +New features +------------ -- Add a `cuda.core.experimental.system` module for querying system- or process- wide information. +- Add a `cuda.core.experimental.system` module for querying system- or process-wide information. - Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. -## Enhancements +Enhancements +------------ - The internal handle held by `ObjectCode` is now lazily initialized upon first touch. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. @@ -28,19 +29,21 @@ Released on Dec 20, 2024 - Improve test coverage & documentation cross-references. - Enforce code formatting. -## Bug fixes +Bug fixes +--------- - Eliminate potential class destruction issues. - Fix circular import during handling a foreign CUDA stream. -## Limitations +Limitations +----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make `cuda.core` better! - Using `cuda.core` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently not supported. This will be fixed in a future release. - Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, - the backend is the cuLink api which supports only a subset of the options that nvjitlink does. + the backend is the cuLink API which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6. - To use `cuda.core` with Python 3.13, it currently requires building `cuda-python` from source - prior to `pip install`. This extra step will be fixed soon. + prior to `pip install`. 
This extra step will be fixed soon. \ No newline at end of file diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 852875b2..75ed3f95 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,15 +1,19 @@ -# `cuda.core` Release notes - `cuda.core` 0.2.0 Release Notes =============================== Released on , 2024 -## Hightlights -- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` +Highlights +---------- + +- Add `ProgramOptions` to facilitate the passing of runtime compile options to `Program`. + +Limitations +----------- + +- -## Limitations -- +Breaking Changes +---------------- -## Breaking Changes -- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. +- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. 
\ No newline at end of file From 2f74ca3c53b09bbe7eafc0d471db8b2beb76a3c7 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 3 Jan 2025 15:43:58 -0800 Subject: [PATCH 18/44] tweak fix --- cuda_core/cuda/core/experimental/_program.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index bdcff3ce..0e8c8207 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -9,7 +9,7 @@ from cuda import nvrtc from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import _handle_boolean_option, handle_return +from cuda.core.experimental._utils import _handle_boolean_option, check_or_create_options, handle_return @dataclass @@ -435,8 +435,9 @@ def compile(self, target_type, name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) + options = self._options._as_bytes() handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options._as_bytes()), + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), handle=self._mnff.handle, ) From 9c88ba72629be37e6a7a838c9687d83672df47b0 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:01:52 -0800 Subject: [PATCH 19/44] Update cuda_core/cuda/core/experimental/_linker.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 81357e07..54b14395 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -94,7 +94,7 @@ class LinkerOptions: Attributes ---------- arch : str, optional - 
Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or ``compute_`` (for generating PTX). If not provided, the current device's architecture will be used. max_register_count : int, optional From 133f6aa42ee581edfb59f70ff0171f27bf5e703c Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:02:03 -0800 Subject: [PATCH 20/44] Update cuda_core/docs/source/release.rst Co-authored-by: Leo Fang --- cuda_core/docs/source/release.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst index f2ae8ea0..4e74423f 100644 --- a/cuda_core/docs/source/release.rst +++ b/cuda_core/docs/source/release.rst @@ -1,5 +1,3 @@ -# Release Notes - Release Notes ============= From cb06afcdc29a277a6b170e44cf32887647910fa9 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:02:47 -0800 Subject: [PATCH 21/44] Update cuda_core/docs/source/release/0.1.0-notes.rst Co-authored-by: Leo Fang --- cuda_core/docs/source/release/0.1.0-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index fb1d8b77..a342ccdd 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -18,4 +18,4 @@ Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedback with us so that we can make `cuda.core` better! 
- Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) \ No newline at end of file +- Windows TCC mode is `not yet supported `_ \ No newline at end of file From 3578f949d24794be2b8c0044b1661012a2217b6f Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:19:04 -0800 Subject: [PATCH 22/44] fix tests --- cuda_core/tests/test_program.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index fd2515b9..3f4e6b85 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -17,8 +17,6 @@ def test_program_with_various_options(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' options_list = [ - ProgramOptions(ptxas_options="-v"), - ProgramOptions(ptxas_options=["-v", "-O3"]), ProgramOptions(device_code_optimize=True, debug=True), ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), @@ -31,21 +29,20 @@ def test_program_with_various_options(init_cuda): ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), - ProgramOptions(pre_include="my_header.h", no_source_include=True), ProgramOptions(builtin_initializer_list=False, disable_warnings=True), ProgramOptions(restrict=True, device_as_default_execution_space=True), ProgramOptions(device_int128=True, optimization_info="inline"), ProgramOptions(no_display_error_number=True), - ProgramOptions(diag_error="1234", diag_suppress="5678"), - ProgramOptions(diag_warn="91011", brief_diagnostics=True), + ProgramOptions(diag_error="1234", diag_suppress="1234"), + ProgramOptions(diag_warn="1000", brief_diagnostics=True), 
ProgramOptions(time="compile_time.csv", split_compile=4), ProgramOptions(fdevice_syntax_only=True, minimal=True), ] - # TODO compile the program once the CI is set up for options in options_list: program = Program(code, "c++", options) assert program.backend == "nvrtc" + program.compile("ptx") program.close() assert program.handle is None @@ -70,6 +67,7 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program +# This is tested against the current device's arch @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' From 261588c215b7797f3d120c9683bdda2dbfaccc80 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:21:07 -0800 Subject: [PATCH 23/44] fix quotes --- cuda_core/docs/source/release/0.1.0-notes.rst | 4 +-- cuda_core/docs/source/release/0.1.1-notes.rst | 28 +++++++++---------- cuda_core/docs/source/release/0.2.0-notes.rst | 6 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index fb1d8b77..2fff35db 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,4 +1,4 @@ -`cuda.core` 0.1.0 Release Notes +``cuda.core`` 0.1.0 Release Notes =============================== Released on Nov 8, 2024 @@ -16,6 +16,6 @@ Limitations ----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedback with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make ``cuda.core`` better! 
- Source code release only; `pip`/`conda` support is coming in a future release - Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) \ No newline at end of file diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 639a0559..d1d2b93d 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,4 +1,4 @@ -`cuda.core` 0.1.1 Release Notes +``cuda.core`` 0.1.1 Release Notes =============================== Released on Dec 20, 2024 @@ -6,26 +6,26 @@ Released on Dec 20, 2024 Highlights ---------- -- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete +- Add ``StridedMemoryView`` and ``@args_viewable_as_strided_memory`` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`. Under - the hood, it uses either the nvJitLink or driver (`cuLink*`) APIs depending on the CUDA version +- Add ``Linker`` that can link one or multiple ``ObjectCode`` instances generated by ``Program``. Under + the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version detected in the current environment. -- Support `pip install cuda-core`. Please see the Installation Guide for further details. +- Support ``pip install cuda-core``. Please see the Installation Guide for further details. New features ------------ -- Add a `cuda.core.experimental.system` module for querying system- or process-wide information. -- Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. +- Add a ``cuda.core.experimental.system`` module for querying system- or process-wide information. +- Add ``LaunchConfig.cluster`` to support thread block clusters on Hopper GPUs. 
Enhancements ------------ -- The internal handle held by `ObjectCode` is now lazily initialized upon first touch. +- The internal handle held by ``ObjectCode`` is now lazily initialized upon first touch. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. -- Ensure `"ltoir"` is a valid code type to `ObjectCode`. -- Document the `__cuda_stream__` protocol. +- Ensure ``"ltoir"`` is a valid code type to ``ObjectCode``. +- Document the ``__cuda_stream__`` protocol. - Improve test coverage & documentation cross-references. - Enforce code formatting. @@ -39,11 +39,11 @@ Limitations ----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedback with us so that we can make `cuda.core` better! -- Using `cuda.core` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently + Please kindly share your feedback with us so that we can make ``cuda.core`` better! +- Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently not supported. This will be fixed in a future release. -- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, +- Some ``LinkerOptions`` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink API which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6. -- To use `cuda.core` with Python 3.13, it currently requires building `cuda-python` from source +- To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source prior to `pip install`. This extra step will be fixed soon. 
\ No newline at end of file diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 75ed3f95..38f1ef29 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,4 +1,4 @@ -`cuda.core` 0.2.0 Release Notes +``cuda.core`` 0.2.0 Release Notes =============================== Released on , 2024 @@ -6,7 +6,7 @@ Released on , 2024 Highlights ---------- -- Add `ProgramOptions` to facilitate the passing of runtime compile options to `Program`. +- Add ``ProgramOptions`` to facilitate the passing of runtime compile options to ``Program``. Limitations ----------- @@ -16,4 +16,4 @@ Limitations Breaking Changes ---------------- -- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. \ No newline at end of file +- The ``Program.Compile`` method no longer accepts an options argument. Instead, you can optionally pass an instance of ``ProgramOptions`` to the constructor of ``Program``. 
\ No newline at end of file From 1abc9f69ca01cf7c4628cbc7a9e75e2331648a9a Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:22:19 -0800 Subject: [PATCH 24/44] remove print --- cuda_core/tests/test_linker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 0073c4c9..37424d7b 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -80,7 +80,6 @@ def compile_ltoir_functions(init_cuda): ], ) def test_linker_init(compile_ptx_functions, options): - print(culink_backend) linker = Linker(*compile_ptx_functions, options=options) object_code = linker.link("cubin") assert isinstance(object_code, ObjectCode) From d20dcfa1c6c845f2e36388c98fe2801ff7fb305b Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:23:36 -0800 Subject: [PATCH 25/44] Update cuda_core/cuda/core/experimental/_utils.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 74488a53..f3d7306b 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -91,7 +91,7 @@ def _handle_boolean_option(option: bool) -> str: """ Convert a boolean option to a string representation. 
""" - return str(option).lower() + return "true" if bool(option) else "false" def precondition(checker: Callable[..., None], what: str = "") -> Callable: From 79fad7ac771bc4bcc8901116b00bb9d14db73e67 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:24:23 -0800 Subject: [PATCH 26/44] fix titles --- cuda_core/docs/source/release/0.1.0-notes.rst | 2 +- cuda_core/docs/source/release/0.1.1-notes.rst | 2 +- cuda_core/docs/source/release/0.2.0-notes.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index de14128d..a5affba2 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,5 +1,5 @@ ``cuda.core`` 0.1.0 Release Notes -=============================== +================================= Released on Nov 8, 2024 diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index d1d2b93d..53ff2ce5 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,5 +1,5 @@ ``cuda.core`` 0.1.1 Release Notes -=============================== +================================= Released on Dec 20, 2024 diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 38f1ef29..8981a7c1 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,5 +1,5 @@ ``cuda.core`` 0.2.0 Release Notes -=============================== +================================= Released on , 2024 From ec9fac1952d9923b3e467d9adb050352dfd6f161 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 10:34:41 -0800 Subject: [PATCH 27/44] remove some options --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py 
b/cuda_core/tests/test_program.py index 3f4e6b85..bb3ac47b 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -22,7 +22,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), ProgramOptions(fma=False, use_fast_math=True), ProgramOptions(extra_device_vectorization=True), - ProgramOptions(link_time_optimization=True, gen_opt_lto=True), + ProgramOptions(link_time_optimization=True), ProgramOptions(define_macro="MY_MACRO"), ProgramOptions(define_macro=("MY_MACRO", "99")), ProgramOptions(define_macro=[("MY_MACRO", "99")]), @@ -34,7 +34,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(device_int128=True, optimization_info="inline"), ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error="1234", diag_suppress="1234"), - ProgramOptions(diag_warn="1000", brief_diagnostics=True), + ProgramOptions(diag_warn="1000"), ProgramOptions(time="compile_time.csv", split_compile=4), ProgramOptions(fdevice_syntax_only=True, minimal=True), ] From f55dcdcb4f69f9257b8d3f80db473a0a5f773a57 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 10:35:45 -0800 Subject: [PATCH 28/44] add TODO --- cuda_core/tests/test_program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index bb3ac47b..4d06bf90 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -13,6 +13,7 @@ from cuda.core.experimental._program import Program, ProgramOptions +# TODO handle and test options whcih are only supported on more modern CUDA versions def test_program_with_various_options(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' From b41d119e9dc1b43dc9756daaaeb688be58cf1b94 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 10:54:33 -0800 Subject: [PATCH 29/44] remove option --- cuda_core/tests/test_program.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 4d06bf90..d4e67ccc 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -36,7 +36,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error="1234", diag_suppress="1234"), ProgramOptions(diag_warn="1000"), - ProgramOptions(time="compile_time.csv", split_compile=4), + ProgramOptions(split_compile=4), ProgramOptions(fdevice_syntax_only=True, minimal=True), ] From de588dead8636a8dcb6b9f228ad691f9c09c241b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 11:17:12 -0800 Subject: [PATCH 30/44] remove options, should pass. --- cuda_core/tests/test_program.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index d4e67ccc..54947053 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -36,8 +36,6 @@ def test_program_with_various_options(init_cuda): ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error="1234", diag_suppress="1234"), ProgramOptions(diag_warn="1000"), - ProgramOptions(split_compile=4), - ProgramOptions(fdevice_syntax_only=True, minimal=True), ] for options in options_list: From 2fbca700983f238ac3dbfd8ae40b9e329407c4ea Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 11:17:49 -0800 Subject: [PATCH 31/44] add issue tracking info --- cuda_core/tests/test_program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 54947053..bbfc4079 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -14,6 +14,7 @@ # TODO handle and test options whcih are only supported on more modern CUDA versions +# tracked in #337 def test_program_with_various_options(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' From 
b79dcccb49481352e496291c1808fec9c137dfb4 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 12:28:40 -0800 Subject: [PATCH 32/44] fix include path argument --- cuda_core/examples/thread_block_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index f4333844..dfbe71a5 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -47,7 +47,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path), ) mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") From bf32370cc653b42a3c6c6bdaa011d8c22211ae33 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 09:33:35 -0800 Subject: [PATCH 33/44] fix the rest format --- cuda_core/docs/source/release/0.1.0-notes.rst | 2 +- cuda_core/docs/source/release/0.1.1-notes.rst | 18 ++++++++++-------- cuda_core/docs/source/release/0.2.0-notes.rst | 6 ++++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index a5affba2..bc327971 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -18,4 +18,4 @@ Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedback with us so that we can make ``cuda.core`` better! 
- Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is `not yet supported `_ \ No newline at end of file +- Windows TCC mode is `not yet supported `_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 53ff2ce5..68cca671 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,3 +1,5 @@ +.. currentmodule:: cuda.core.experimental + ``cuda.core`` 0.1.1 Release Notes ================================= @@ -6,9 +8,9 @@ Released on Dec 20, 2024 Highlights ---------- -- Add ``StridedMemoryView`` and ``@args_viewable_as_strided_memory`` that provide a concrete +- Add :obj:`~utils.StridedMemoryView` and :func:`~utils.args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Add ``Linker`` that can link one or multiple ``ObjectCode`` instances generated by ``Program``. Under +- Add :obj:`~Linker` that can link one or multiple :obj:`~_module.ObjectCode` instances generated by :obj:`~Program`. Under the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version detected in the current environment. - Support ``pip install cuda-core``. Please see the Installation Guide for further details. @@ -16,15 +18,15 @@ Highlights New features ------------ -- Add a ``cuda.core.experimental.system`` module for querying system- or process-wide information. -- Add ``LaunchConfig.cluster`` to support thread block clusters on Hopper GPUs. +- Add a :obj:`cuda.core.experiemental.system` module for querying system- or process-wide information. +- Add :obj:`~LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. Enhancements ------------ -- The internal handle held by ``ObjectCode`` is now lazily initialized upon first touch. 
+- The internal handle held by :obj:`~_module.ObjectCode` is now lazily initialized upon first touch. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. -- Ensure ``"ltoir"`` is a valid code type to ``ObjectCode``. +- Ensure ``"ltoir"`` is a valid code type to :obj:`~_module.ObjectCode`. - Document the ``__cuda_stream__`` protocol. - Improve test coverage & documentation cross-references. - Enforce code formatting. @@ -42,8 +44,8 @@ Limitations Please kindly share your feedback with us so that we can make ``cuda.core`` better! - Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently not supported. This will be fixed in a future release. -- Some ``LinkerOptions`` are only available when using a modern version of CUDA. When using CUDA <12, +- Some :class:`~LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink API which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6. - To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source - prior to `pip install`. This extra step will be fixed soon. \ No newline at end of file + prior to `pip install`. This extra step will be fixed soon. diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 8981a7c1..3a8b5a78 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,3 +1,5 @@ +.. currentmodule:: cuda.core.experimental + ``cuda.core`` 0.2.0 Release Notes ================================= @@ -6,7 +8,7 @@ Released on , 2024 Highlights ---------- -- Add ``ProgramOptions`` to facilitate the passing of runtime compile options to ``Program``. +- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`. 
Limitations ----------- @@ -16,4 +18,4 @@ Limitations Breaking Changes ---------------- -- The ``Program.Compile`` method no longer accepts an options argument. Instead, you can optionally pass an instance of ``ProgramOptions`` to the constructor of ``Program``. \ No newline at end of file +- The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`. From 34c87805b9683444fc5aac351436eaf9f68251d3 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 10:42:12 -0800 Subject: [PATCH 34/44] handle nested tuples within lists and tuples, and fix the handling of error number options --- cuda_core/cuda/core/experimental/_program.py | 73 ++++++++++++-------- cuda_core/cuda/core/experimental/_utils.py | 16 ++++- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 0e8c8207..a685458a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -9,12 +9,17 @@ from cuda import nvrtc from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import _handle_boolean_option, check_or_create_options, handle_return +from cuda.core.experimental._utils import ( + _handle_boolean_option, + check_or_create_options, + handle_return, + is_sequence, +) @dataclass class ProgramOptions: - """Customizable :obj:`ProgramOptions` for NVRTC. + """Customizable options for configuring `Program`. Attributes ---------- @@ -147,16 +152,16 @@ class ProgramOptions: Disable the display of a diagnostic number for warning messages. Default: False Maps to: --no-display-error-number (-no-err-no) - diag_error : str, optional - Emit error for specified diagnostic message number(s). 
+ diag_error : Union[int, List[int]], optional + Emit error for a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: --diag-error=,… (-diag-error) - diag_suppress : str, optional - Suppress specified diagnostic message number(s). + Maps to: --diag-error=, ... (-diag-error) + diag_suppress : Union[int, List[int]], optional + Suppress a specified diagnostic message number or comma separated list of numbers. Default: None Maps to: --diag-suppress=,… (-diag-suppress) - diag_warn : str, optional - Emit warning for specified diagnostic message number(s). + diag_warn : Union[int, List[int]], optional + Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None Maps to: --diag-warn=,… (-diag-warn) brief_diagnostics : bool, optional @@ -182,14 +187,12 @@ class ProgramOptions: """ arch: Optional[str] = None - device_c: Optional[bool] = None - device_w: Optional[bool] = None relocatable_device_code: Optional[bool] = None extensible_whole_program: Optional[bool] = None debug: Optional[bool] = None lineinfo: Optional[bool] = None device_code_optimize: Optional[bool] = None - ptxas_options: Optional[Union[str, List[str]]] = None + ptxas_options: Optional[Union[str, List[str], Tuple[str]]] = None max_register_count: Optional[int] = None ftz: Optional[bool] = None prec_sqrt: Optional[bool] = None @@ -199,10 +202,12 @@ class ProgramOptions: extra_device_vectorization: Optional[bool] = None link_time_optimization: Optional[bool] = None gen_opt_lto: Optional[bool] = None - define_macro: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None - undefine_macro: Optional[Union[str, List[str]]] = None - include_path: Optional[Union[str, List[str]]] = None - pre_include: Optional[Union[str, List[str]]] = None + define_macro: Optional[ + Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]], Tuple[Union[str, Tuple[str, str]]]] + ] = None + undefine_macro: 
Optional[Union[str, List[str], Tuple[str]]] = None + include_path: Optional[Union[str, List[str], Tuple[str]]] = None + pre_include: Optional[Union[str, List[str], Tuple[str]]] = None no_source_include: Optional[bool] = None std: Optional[str] = None builtin_move_forward: Optional[bool] = None @@ -213,9 +218,9 @@ class ProgramOptions: device_int128: Optional[bool] = None optimization_info: Optional[str] = None no_display_error_number: Optional[bool] = None - diag_error: Optional[str] = None - diag_suppress: Optional[str] = None - diag_warn: Optional[str] = None + diag_error: Optional[Union[int, List[int], Tuple[int]]] = None + diag_suppress: Optional[Union[int, List[int], Tuple[int]]] = None + diag_warn: Optional[Union[int, List[int], Tuple[int]]] = None brief_diagnostics: Optional[bool] = None time: Optional[str] = None split_compile: Optional[int] = None @@ -244,11 +249,11 @@ def __post_init__(self): self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not None: self._formatted_options.append("--ptxas-options") - if isinstance(self.ptxas_options, list): + if is_sequence(self.ptxas_options): for option in self.ptxas_options: self._formatted_options.append(option) else: - self._formatted_options.append("self.ptxas_options") + self._formatted_options.append(self.ptxas_options) if self.max_register_count is not None: self._formatted_options.append(f"--maxrregcount={self.max_register_count}") if self.ftz is not None: @@ -268,7 +273,7 @@ def __post_init__(self): if self.gen_opt_lto is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") if self.define_macro is not None: - if isinstance(self.define_macro, list): + if is_sequence(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 @@ -282,19 +287,19 @@ def __post_init__(self): self._formatted_options.append(f"--define-macro={self.define_macro}") if self.undefine_macro is not None: - 
if isinstance(self.undefine_macro, list): + if is_sequence(self.undefine_macro): for macro in self.undefine_macro: self._formatted_options.append(f"--undefine-macro={macro}") else: self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") if self.include_path is not None: - if isinstance(self.include_path, list): + if is_sequence(self.include_path): for path in self.include_path: self._formatted_options.append(f"--include-path={path}") else: self._formatted_options.append(f"--include-path={self.include_path}") if self.pre_include is not None: - if isinstance(self.pre_include, list): + if is_sequence(self.pre_include): for header in self.pre_include: self._formatted_options.append(f"--pre-include={header}") else: @@ -324,11 +329,23 @@ def __post_init__(self): if self.no_display_error_number is not None and self.no_display_error_number: self._formatted_options.append("--no-display-error-number") if self.diag_error is not None: - self._formatted_options.append(f"--diag-error={self.diag_error}") + if is_sequence(self.diag_error): + for error in self.diag_error: + self._formatted_options.append(f"--diag-error={error}") + else: + self._formatted_options.append(f"--diag-error={self.diag_error}") if self.diag_suppress is not None: - self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + if is_sequence(self.diag_suppress): + for suppress in self.diag_suppress: + self._formatted_options.append(f"--diag-suppress={suppress}") + else: + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") if self.diag_warn is not None: - self._formatted_options.append(f"--diag-warn={self.diag_warn}") + if is_sequence(self.diag_warn): + for warn in self.diag_warn: + self._formatted_options.append(f"--diag-warn={warn}") + else: + self._formatted_options.append(f"--diag-warn={self.diag_warn}") if self.brief_diagnostics is not None: self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") if 
self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index f3d7306b..8cb6abaf 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import functools -from collections import namedtuple +from collections import Sequence, namedtuple from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -141,3 +141,17 @@ def get_device_from_ctx(ctx_handle) -> int: assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(prev_ctx)) return device_id + + +def is_sequence(obj): + """ + Check if the given object is a sequence (list or tuple). + """ + return isinstance(obj, Sequence) + + +def is_nested_sequence(obj): + """ + Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). + """ + return is_sequence(obj) and any(is_sequence(elem) for elem in obj) From cc259606e3a74ad52ae16049b483f18e55457006 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:29:56 -0800 Subject: [PATCH 35/44] change from sequence to list or tuple --- cuda_core/cuda/core/experimental/_program.py | 21 ++++++++++---------- cuda_core/cuda/core/experimental/_utils.py | 10 +++++----- cuda_core/tests/test_program.py | 3 ++- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index a685458a..bd5c18aa 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -13,7 +13,8 @@ _handle_boolean_option, check_or_create_options, handle_return, - is_sequence, + is_list_or_tuple, + is_nested_list_or_tuple, ) @@ -249,7 +250,7 @@ def __post_init__(self): self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not 
None: self._formatted_options.append("--ptxas-options") - if is_sequence(self.ptxas_options): + if is_list_or_tuple(self.ptxas_options): for option in self.ptxas_options: self._formatted_options.append(option) else: @@ -273,7 +274,7 @@ def __post_init__(self): if self.gen_opt_lto is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") if self.define_macro is not None: - if is_sequence(self.define_macro): + if is_nested_list_or_tuple(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 @@ -282,24 +283,24 @@ def __post_init__(self): self._formatted_options.append(f"--define-macro={macro}") elif isinstance(self.define_macro, tuple): assert len(self.define_macro) == 2 - self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + self._formatted_options.append("--define-macro=MY_MACRO=999") else: self._formatted_options.append(f"--define-macro={self.define_macro}") if self.undefine_macro is not None: - if is_sequence(self.undefine_macro): + if is_list_or_tuple(self.undefine_macro): for macro in self.undefine_macro: self._formatted_options.append(f"--undefine-macro={macro}") else: self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") if self.include_path is not None: - if is_sequence(self.include_path): + if is_list_or_tuple(self.include_path): for path in self.include_path: self._formatted_options.append(f"--include-path={path}") else: self._formatted_options.append(f"--include-path={self.include_path}") if self.pre_include is not None: - if is_sequence(self.pre_include): + if is_list_or_tuple(self.pre_include): for header in self.pre_include: self._formatted_options.append(f"--pre-include={header}") else: @@ -329,19 +330,19 @@ def __post_init__(self): if self.no_display_error_number is not None and self.no_display_error_number: self._formatted_options.append("--no-display-error-number") if self.diag_error is not None: - if 
is_sequence(self.diag_error): + if is_list_or_tuple(self.diag_error): for error in self.diag_error: self._formatted_options.append(f"--diag-error={error}") else: self._formatted_options.append(f"--diag-error={self.diag_error}") if self.diag_suppress is not None: - if is_sequence(self.diag_suppress): + if is_list_or_tuple(self.diag_suppress): for suppress in self.diag_suppress: self._formatted_options.append(f"--diag-suppress={suppress}") else: self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") if self.diag_warn is not None: - if is_sequence(self.diag_warn): + if is_list_or_tuple(self.diag_warn): for warn in self.diag_warn: self._formatted_options.append(f"--diag-warn={warn}") else: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 8cb6abaf..9e64311b 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import functools -from collections import Sequence, namedtuple +from collections import namedtuple from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -143,15 +143,15 @@ def get_device_from_ctx(ctx_handle) -> int: return device_id -def is_sequence(obj): +def is_list_or_tuple(obj): """ Check if the given object is a sequence (list or tuple). """ - return isinstance(obj, Sequence) + return isinstance(obj, (list, tuple)) -def is_nested_sequence(obj): +def is_nested_list_or_tuple(obj): """ Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). 
""" - return is_sequence(obj) and any(is_sequence(elem) for elem in obj) + return is_list_or_tuple(obj) and any(is_list_or_tuple(elem) for elem in obj) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index bbfc4079..0fbd5866 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -35,7 +35,8 @@ def test_program_with_various_options(init_cuda): ProgramOptions(restrict=True, device_as_default_execution_space=True), ProgramOptions(device_int128=True, optimization_info="inline"), ProgramOptions(no_display_error_number=True), - ProgramOptions(diag_error="1234", diag_suppress="1234"), + ProgramOptions(diag_error=1234, diag_suppress=1234), + ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), ProgramOptions(diag_warn="1000"), ] From e4786b2d34aa0dc0353728936f12be7fb068fe19 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Wed, 8 Jan 2025 11:34:14 -0800 Subject: [PATCH 36/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index bd5c18aa..6dd807b1 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -25,12 +25,9 @@ class ProgramOptions: Attributes ---------- arch : str, optional - Specify the name of the class of GPU architectures for which the input must be compiled. - Valid values: compute_50, compute_52, compute_53, compute_60, compute_61, compute_62, compute_70, compute_72, - compute_75, compute_80, compute_87, compute_89, compute_90, compute_90a, sm_50, sm_52, sm_53, sm_60, sm_61, - sm_62, sm_70, sm_72, sm_75, sm_80, sm_87, sm_89, sm_90, sm_90a. 
- Default: compute_52 - Maps to: --gpu-architecture= (-arch) + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. relocatable_device_code : bool, optional Enable (disable) the generation of relocatable device code. Default: False From d913e0a3dba3413538dc450a5938bf9a3f0507ba Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:37:25 -0800 Subject: [PATCH 37/44] fix quotes --- cuda_core/cuda/core/experimental/_program.py | 78 ++++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 6dd807b1..7646df9b 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -20,9 +20,7 @@ @dataclass class ProgramOptions: - """Customizable options for configuring `Program`. - - Attributes + """Attributes ---------- arch : str, optional Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or @@ -31,157 +29,157 @@ class ProgramOptions: relocatable_device_code : bool, optional Enable (disable) the generation of relocatable device code. Default: False - Maps to: --relocatable-device-code={true|false} (-rdc) + Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``) extensible_whole_program : bool, optional Do extensible whole program compilation of device code. Default: False - Maps to: --extensible-whole-program (-ewp) + Maps to: ``--extensible-whole-program`` (``-ewp``) debug : bool, optional Generate debug information. If --dopt is not specified, then turns off all optimizations. Default: False - Maps to: --device-debug (-G) + Maps to: ``--device-debug`` (``-G``) lineinfo: bool, optional Generate line-number information. 
Default: False - Maps to: --generate-line-info (-lineinfo) + Maps to: ``--generate-line-info`` (``-lineinfo``) device_code_optimize : bool, optional Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation for optimized device code. Default: None - Maps to: --dopt on (-dopt) + Maps to: ``--dopt on`` (``-dopt``) ptxas_options : Union[str, List[str]], optional Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. For example ["-v", "-O2"]. Default: None - Maps to: --ptxas-options (-Xptxas) + Maps to: ``--ptxas-options `` (``-Xptxas``) max_register_count : int, optional Specify the maximum amount of registers that GPU functions can use. Default: None - Maps to: --maxrregcount= (-maxrregcount) + Maps to: ``--maxrregcount=`` (``-maxrregcount``) ftz : bool, optional When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal values. Default: False - Maps to: --ftz={true|false} (-ftz) + Maps to: ``--ftz={true|false}`` (``-ftz``) prec_sqrt : bool, optional For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: --prec-sqrt={true|false} (-prec-sqrt) + Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``) prec_div : bool, optional For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: --prec-div={true|false} (-prec-div) + Maps to: ``--prec-div={true|false}`` (``-prec-div``) fma : bool, optional Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations. Default: True - Maps to: --fmad={true|false} (-fmad) + Maps to: ``--fmad={true|false}`` (``-fmad``) use_fast_math : bool, optional Make use of fast math operations. 
Default: False - Maps to: --use_fast_math (-use_fast_math) + Maps to: ``--use_fast_math`` (``-use_fast_math``) extra_device_vectorization : bool, optional Enables more aggressive device code vectorization in the NVVM optimizer. Default: False - Maps to: --extra-device-vectorization (-extra-device-vectorization) + Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``) link_time_optimization : bool, optional Generate intermediate code for later link-time optimization. Default: False - Maps to: --dlink-time-opt (-dlto) + Maps to: ``--dlink-time-opt`` (``-dlto``) gen_opt_lto : bool, optional Run the optimizer passes before generating the LTO IR. Default: False - Maps to: --gen-opt-lto (-gen-opt-lto) + Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``) define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of strings, in which case the first element is defined as the second, or a list of strings or tuples. Default: None - Maps to: --define-macro= (-D) + Maps to: ``--define-macro=`` (``-D``) undefine_macro : Union[str, List[str]], optional Cancel any previous definition of a macro, or list of macros. Default: None - Maps to: --undefine-macro= (-U) + Maps to: ``--undefine-macro=`` (``-U``) include_path : Union[str, List[str]], optional Add the directory or directories to the list of directories to be searched for headers. Default: None - Maps to: --include-path= (-I) + Maps to: ``--include-path=`` (``-I``) pre_include : Union[str, List[str]], optional Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. Default: None - Maps to: --pre-include=
(-include) + Maps to: ``--pre-include=
`` (``-include``) no_source_include : bool, optional Disable the default behavior of adding the directory of each input source to the include path. Default: False - Maps to: --no-source-include (-no-source-include) + Maps to: ``--no-source-include`` (``-no-source-include``) std : str, optional Set language dialect to C++03, C++11, C++14, C++17 or C++20. Default: c++17 - Maps to: --std={c++03|c++11|c++14|c++17|c++20} (-std) + Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``) builtin_move_forward : bool, optional Provide builtin definitions of std::move and std::forward. Default: True - Maps to: --builtin-move-forward={true|false} (-builtin-move-forward) + Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``) builtin_initializer_list : bool, optional Provide builtin definitions of std::initializer_list class and member functions. Default: True - Maps to: --builtin-initializer-list={true|false} (-builtin-initializer-list) + Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``) disable_warnings : bool, optional Inhibit all warning messages. Default: False - Maps to: --disable-warnings (-w) + Maps to: ``--disable-warnings`` (``-w``) restrict : bool, optional Programmer assertion that all kernel pointer parameters are restrict pointers. Default: False - Maps to: --restrict (-restrict) + Maps to: ``--restrict`` (``-restrict``) device_as_default_execution_space : bool, optional Treat entities with no execution space annotation as __device__ entities. Default: False - Maps to: --device-as-default-execution-space (-default-device) + Maps to: ``--device-as-default-execution-space`` (``-default-device``) device_int128 : bool, optional Allow the __int128 type in device code. Default: False - Maps to: --device-int128 (-device-int128) + Maps to: ``--device-int128`` (``-device-int128``) optimization_info : str, optional Provide optimization reports for the specified kind of optimization. 
Default: None - Maps to: --optimization-info= (-opt-info) + Maps to: ``--optimization-info=`` (``-opt-info``) no_display_error_number : bool, optional Disable the display of a diagnostic number for warning messages. Default: False - Maps to: --no-display-error-number (-no-err-no) + Maps to: ``--no-display-error-number`` (``-no-err-no``) diag_error : Union[int, List[int]], optional Emit error for a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: --diag-error=, ... (-diag-error) + Maps to: ``--diag-error=, ...`` (``-diag-error``) diag_suppress : Union[int, List[int]], optional Suppress a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: --diag-suppress=,… (-diag-suppress) + Maps to: ``--diag-suppress=,…`` (``-diag-suppress``) diag_warn : Union[int, List[int]], optional Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None - Maps to: --diag-warn=,… (-diag-warn) + Maps to: ``--diag-warn=,…`` (``-diag-warn``) brief_diagnostics : bool, optional Disable or enable showing source line and column info in a diagnostic. Default: False - Maps to: --brief-diagnostics={true|false} (-brief-diag) + Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``) time : str, optional Generate a CSV table with the time taken by each compilation phase. Default: None - Maps to: --time= (-time) + Maps to: ``--time=`` (``-time``) split_compile : int, optional Perform compiler optimizations in parallel. Default: 1 - Maps to: --split-compile= (-split-compile) + Maps to: ``--split-compile= `` (``-split-compile``) fdevice_syntax_only : bool, optional Ends device compilation after front-end syntax checking. Default: False - Maps to: --fdevice-syntax-only (-fdevice-syntax-only) + Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``) minimal : bool, optional Omit certain language features to reduce compile time for small programs. 
Default: False - Maps to: --minimal (-minimal) + Maps to: ``--minimal`` (``-minimal``) """ arch: Optional[str] = None From 06c68b45ca7beb178c51e50199b3121a4904a9d8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:38:02 -0800 Subject: [PATCH 38/44] fix quotes --- cuda_core/cuda/core/experimental/_program.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 7646df9b..789ef539 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -20,7 +20,9 @@ @dataclass class ProgramOptions: - """Attributes + """Customizable options for configuring `Program`. + + Attributes ---------- arch : str, optional Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or From 3a747f68bd1b49d3ea36986ed4bd561400243d9d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:40:28 -0800 Subject: [PATCH 39/44] swap api order --- cuda_core/docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 191760ae..5239174f 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -35,8 +35,8 @@ CUDA compilation toolchain :template: dataclass.rst - LinkerOptions ProgramOptions + LinkerOptions CUDA system information From 9ca2f409d673d300574b99ccf8f3c9e568f0bbe3 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 13 Jan 2025 14:26:03 -0800 Subject: [PATCH 40/44] switch the order fo comparisons to use sequence instead of list / tuple --- cuda_core/cuda/core/experimental/_program.py | 53 ++++++++++---------- cuda_core/cuda/core/experimental/_utils.py | 9 ++-- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 789ef539..85c46df5 100644 --- 
a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -13,8 +13,8 @@ _handle_boolean_option, check_or_create_options, handle_return, - is_list_or_tuple, - is_nested_list_or_tuple, + is_nested_sequence, + is_sequence, ) @@ -247,11 +247,11 @@ def __post_init__(self): self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not None: self._formatted_options.append("--ptxas-options") - if is_list_or_tuple(self.ptxas_options): + if isinstance(self.ptxas_options, str): + self._formatted_options.append(self.ptxas_options) + elif is_sequence(self.ptxas_options): for option in self.ptxas_options: self._formatted_options.append(option) - else: - self._formatted_options.append(self.ptxas_options) if self.max_register_count is not None: self._formatted_options.append(f"--maxrregcount={self.max_register_count}") if self.ftz is not None: @@ -271,7 +271,9 @@ def __post_init__(self): if self.gen_opt_lto is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") if self.define_macro is not None: - if is_nested_list_or_tuple(self.define_macro): + if isinstance(self.define_macro, str): + self._formatted_options.append(f"--define-macro={self.define_macro}") + if is_nested_sequence(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 @@ -281,27 +283,26 @@ def __post_init__(self): elif isinstance(self.define_macro, tuple): assert len(self.define_macro) == 2 self._formatted_options.append("--define-macro=MY_MACRO=999") - else: - self._formatted_options.append(f"--define-macro={self.define_macro}") if self.undefine_macro is not None: - if is_list_or_tuple(self.undefine_macro): + if isinstance(self.undefine_macro, str): + self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") + elif is_sequence(self.undefine_macro): for macro in self.undefine_macro: 
self._formatted_options.append(f"--undefine-macro={macro}") - else: - self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") if self.include_path is not None: - if is_list_or_tuple(self.include_path): + if isinstance(self.include_path, str): + self._formatted_options.append(f"--include-path={self.include_path}") + elif is_sequence(self.include_path): for path in self.include_path: self._formatted_options.append(f"--include-path={path}") - else: - self._formatted_options.append(f"--include-path={self.include_path}") if self.pre_include is not None: - if is_list_or_tuple(self.pre_include): + if isinstance(self.pre_include, str): + self._formatted_options.append(f"--pre-include={self.pre_include}") + elif is_sequence(self.pre_include): for header in self.pre_include: self._formatted_options.append(f"--pre-include={header}") - else: - self._formatted_options.append(f"--pre-include={self.pre_include}") + if self.no_source_include is not None and self.no_source_include: self._formatted_options.append("--no-source-include") if self.std is not None: @@ -327,23 +328,23 @@ def __post_init__(self): if self.no_display_error_number is not None and self.no_display_error_number: self._formatted_options.append("--no-display-error-number") if self.diag_error is not None: - if is_list_or_tuple(self.diag_error): + if isinstance(self.diag_error, int): + self._formatted_options.append(f"--diag-error={self.diag_error}") + elif is_sequence(self.diag_error): for error in self.diag_error: self._formatted_options.append(f"--diag-error={error}") - else: - self._formatted_options.append(f"--diag-error={self.diag_error}") if self.diag_suppress is not None: - if is_list_or_tuple(self.diag_suppress): + if isinstance(self.diag_suppress, int): + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + elif is_sequence(self.diag_suppress): for suppress in self.diag_suppress: self._formatted_options.append(f"--diag-suppress={suppress}") - else: - 
self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") if self.diag_warn is not None: - if is_list_or_tuple(self.diag_warn): + if isinstance(self.diag_warn, int): + self._formatted_options.append(f"--diag-warn={self.diag_warn}") + elif is_sequence(self.diag_warn): for warn in self.diag_warn: self._formatted_options.append(f"--diag-warn={warn}") - else: - self._formatted_options.append(f"--diag-warn={self.diag_warn}") if self.brief_diagnostics is not None: self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") if self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 9e64311b..f0bac9d4 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -4,6 +4,7 @@ import functools from collections import namedtuple +from collections.abc import Sequence from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -143,15 +144,15 @@ def get_device_from_ctx(ctx_handle) -> int: return device_id -def is_list_or_tuple(obj): +def is_sequence(obj): """ Check if the given object is a sequence (list or tuple). """ - return isinstance(obj, (list, tuple)) + return isinstance(obj, Sequence) -def is_nested_list_or_tuple(obj): +def is_nested_sequence(obj): """ Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). 
""" - return is_list_or_tuple(obj) and any(is_list_or_tuple(elem) for elem in obj) + return is_sequence(obj) and any(is_sequence(elem) for elem in obj) From a710b454e6d4fbee583b1592df2d57f6df915b2b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 13 Jan 2025 14:30:56 -0800 Subject: [PATCH 41/44] fix order and debug code --- cuda_core/cuda/core/experimental/_program.py | 8 ++++---- cuda_core/tests/test_program.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 85c46df5..2b8889c4 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -273,16 +273,16 @@ def __post_init__(self): if self.define_macro is not None: if isinstance(self.define_macro, str): self._formatted_options.append(f"--define-macro={self.define_macro}") - if is_nested_sequence(self.define_macro): + elif isinstance(self.define_macro, tuple): + assert len(self.define_macro) == 2 + self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + elif is_nested_sequence(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") else: self._formatted_options.append(f"--define-macro={macro}") - elif isinstance(self.define_macro, tuple): - assert len(self.define_macro) == 2 - self._formatted_options.append("--define-macro=MY_MACRO=999") if self.undefine_macro is not None: if isinstance(self.undefine_macro, str): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 0fbd5866..0f9b8e3b 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -37,7 +37,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error=1234, diag_suppress=1234), 
ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), - ProgramOptions(diag_warn="1000"), + ProgramOptions(diag_warn=1000), ] for options in options_list: From f4b0f0a96b2d5940477f07979a62e43c4ad2afa0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 15 Jan 2025 15:04:36 -0800 Subject: [PATCH 42/44] fix ruff --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/cuda/core/experimental/_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 66232928..3db9e8ab 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -6,8 +6,8 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch -from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions from cuda.core.experimental._system import System diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index b0254e31..70d7aae4 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -157,8 +157,8 @@ def is_nested_sequence(obj): Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). 
""" return is_sequence(obj) and any(is_sequence(elem) for elem in obj) - - + + def get_binding_version(): try: major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2] From 459cbdbca4f9144fdd8feaebd43dd7f49db361d2 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 17 Jan 2025 12:38:46 -0800 Subject: [PATCH 43/44] lint the merge --- cuda_core/cuda/core/experimental/_linker.py | 1 - cuda_core/cuda/core/experimental/_program.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 6ad8220c..b5a6b675 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -9,7 +9,6 @@ from dataclasses import dataclass from typing import List, Optional - from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import check_or_create_options, driver, handle_return diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index aa7d9ead..4f094118 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -6,7 +6,6 @@ from dataclasses import dataclass from typing import List, Optional, Tuple, Union - from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import ( @@ -15,7 +14,7 @@ handle_return, is_nested_sequence, is_sequence, - nvrtc + nvrtc, ) From 9ef482157aa09359429a15b3d06f1b7a585ea915 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 19 Jan 2025 04:45:19 +0000 Subject: [PATCH 44/44] nits --- cuda_core/cuda/core/experimental/_program.py | 6 +++--- cuda_core/examples/vector_add.py | 3 ++- cuda_core/tests/test_linker.py | 12 ++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py 
b/cuda_core/cuda/core/experimental/_program.py index 4f094118..c10fd077 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE @@ -403,6 +403,8 @@ def close(self): def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff = Program._MembersNeededForFinalize(self, None) + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") + if code_type not in self._supported_code_type: raise NotImplementedError @@ -416,8 +418,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None): else: raise NotImplementedError - self._options = options = check_or_create_options(ProgramOptions, options, "Program options") - def close(self): """Destroy this program.""" self._mnff.close() diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 5a42ab7b..ec398209 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -26,7 +26,8 @@ s = dev.create_stream() # prepare program -program_options = ProgramOptions(std="c++17", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") prog = Program(code, code_type="c++", options=program_options) mod = prog.compile("cubin", name_expressions=("vector_add",)) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 37424d7b..b81c1654 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -86,14 +86,10 @@ def test_linker_init(compile_ptx_functions, options): def test_linker_init_invalid_arch(compile_ptx_functions): - if culink_backend: - with 
pytest.raises(AttributeError): - options = LinkerOptions(arch="99", ptx=True) - Linker(*compile_ptx_functions, options=options) - else: - with pytest.raises(nvjitlink.nvJitLinkError): - options = LinkerOptions(arch="99", ptx=True) - Linker(*compile_ptx_functions, options=options) + err = AttributeError if culink_backend else nvjitlink.nvJitLinkError + with pytest.raises(err): + options = LinkerOptions(arch="99", ptx=True) + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option")