From 6d789cb19c0b8d8279fb72e9328b466ef0640dec Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:20:34 -0800 Subject: [PATCH 01/44] commit squash: add ProgramOptions to Program --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/cuda/core/experimental/_program.py | 371 ++++++++++++++++++- cuda_core/cuda/core/experimental/_stream.py | 4 +- cuda_core/cuda/core/experimental/_utils.py | 7 + cuda_core/docs/source/release/0.2.0-notes.md | 12 + cuda_core/examples/saxpy.py | 9 +- cuda_core/examples/vector_add.py | 14 +- cuda_core/tests/test_program.py | 39 +- 8 files changed, 425 insertions(+), 33 deletions(-) create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..c48f2c2c 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -5,5 +5,5 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch -from cuda.core.experimental._program import Program +from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 6cf13c83..b8442f83 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -2,9 +2,353 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + from cuda import nvrtc +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import _handle_boolean_option, handle_return + + 
+@dataclass +class ProgramOptions: + """Customizable :obj:`ProgramOptions` for NVRTC. + + Attributes + ---------- + gpu_architecture : str, optional + Specify the name of the class of GPU architectures for which the input must be compiled. + Valid values: compute_50, compute_52, compute_53, compute_60, compute_61, compute_62, compute_70, compute_72, + compute_75, compute_80, compute_87, compute_89, compute_90, compute_90a, sm_50, sm_52, sm_53, sm_60, sm_61, + sm_62, sm_70, sm_72, sm_75, sm_80, sm_87, sm_89, sm_90, sm_90a. + Default: compute_52 + Maps to: --gpu-architecture= (-arch) + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. + Default: False + Maps to: --relocatable-device-code={true|false} (-rdc) + extensible_whole_program : bool, optional + Do extensible whole program compilation of device code. + Default: False + Maps to: --extensible-whole-program (-ewp) + device_debug : bool, optional + Generate debug information. If --dopt is not specified, then turns off all optimizations. + Default: False + Maps to: --device-debug (-G) + generate_line_info : bool, optional + Generate line-number information. + Default: False + Maps to: --generate-line-info (-lineinfo) + device_optimize : bool, optional + Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation + for optimized device code. + Default: None + Maps to: --dopt on (-dopt) + ptxas_options : Union[str, List[str]], optional + Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. + For example ["-v", "-O2"]. + Default: None + Maps to: --ptxas-options (-Xptxas) + maxrregcount : int, optional + Specify the maximum amount of registers that GPU functions can use. 
+ Default: None + Maps to: --maxrregcount= (-maxrregcount) + ftz : bool, optional + When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal + values. + Default: False + Maps to: --ftz={true|false} (-ftz) + prec_sqrt : bool, optional + For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. + Default: True + Maps to: --prec-sqrt={true|false} (-prec-sqrt) + prec_div : bool, optional + For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster + approximation. + Default: True + Maps to: --prec-div={true|false} (-prec-div) + fmad : bool, optional + Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point + multiply-add operations. + Default: True + Maps to: --fmad={true|false} (-fmad) + use_fast_math : bool, optional + Make use of fast math operations. + Default: False + Maps to: --use_fast_math (-use_fast_math) + extra_device_vectorization : bool, optional + Enables more aggressive device code vectorization in the NVVM optimizer. + Default: False + Maps to: --extra-device-vectorization (-extra-device-vectorization) + dlink_time_opt : bool, optional + Generate intermediate code for later link-time optimization. + Default: False + Maps to: --dlink-time-opt (-dlto) + gen_opt_lto : bool, optional + Run the optimizer passes before generating the LTO IR. + Default: False + Maps to: --gen-opt-lto (-gen-opt-lto) + define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional + Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of + strings, in which case the first element is defined as the second, or a list of strings or tuples. + Default: None + Maps to: --define-macro= (-D) + undefine_macro : Union[str, List[str]], optional + Cancel any previous definition of a macro, or list of macros. 
+ Default: None + Maps to: --undefine-macro= (-U) + include_path : Union[str, List[str]], optional + Add the directory or directories to the list of directories to be searched for headers. + Default: None + Maps to: --include-path= (-I) + pre_include : Union[str, List[str]], optional + Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. + Default: None + Maps to: --pre-include=
(-include) + no_source_include : bool, optional + Disable the default behavior of adding the directory of each input source to the include path. + Default: False + Maps to: --no-source-include (-no-source-include) + std : str, optional + Set language dialect to C++03, C++11, C++14, C++17 or C++20. + Default: c++17 + Maps to: --std={c++03|c++11|c++14|c++17|c++20} (-std) + builtin_move_forward : bool, optional + Provide builtin definitions of std::move and std::forward. + Default: True + Maps to: --builtin-move-forward={true|false} (-builtin-move-forward) + builtin_initializer_list : bool, optional + Provide builtin definitions of std::initializer_list class and member functions. + Default: True + Maps to: --builtin-initializer-list={true|false} (-builtin-initializer-list) + disable_warnings : bool, optional + Inhibit all warning messages. + Default: False + Maps to: --disable-warnings (-w) + restrict : bool, optional + Programmer assertion that all kernel pointer parameters are restrict pointers. + Default: False + Maps to: --restrict (-restrict) + device_as_default_execution_space : bool, optional + Treat entities with no execution space annotation as __device__ entities. + Default: False + Maps to: --device-as-default-execution-space (-default-device) + device_int128 : bool, optional + Allow the __int128 type in device code. + Default: False + Maps to: --device-int128 (-device-int128) + optimization_info : str, optional + Provide optimization reports for the specified kind of optimization. + Default: None + Maps to: --optimization-info= (-opt-info) + no_display_error_number : bool, optional + Disable the display of a diagnostic number for warning messages. + Default: False + Maps to: --no-display-error-number (-no-err-no) + diag_error : str, optional + Emit error for specified diagnostic message number(s). + Default: None + Maps to: --diag-error=,… (-diag-error) + diag_suppress : str, optional + Suppress specified diagnostic message number(s). 
+ Default: None + Maps to: --diag-suppress=,… (-diag-suppress) + diag_warn : str, optional + Emit warning for specified diagnostic message number(s). + Default: None + Maps to: --diag-warn=,… (-diag-warn) + brief_diagnostics : bool, optional + Disable or enable showing source line and column info in a diagnostic. + Default: False + Maps to: --brief-diagnostics={true|false} (-brief-diag) + time : str, optional + Generate a CSV table with the time taken by each compilation phase. + Default: None + Maps to: --time= (-time) + split_compile : int, optional + Perform compiler optimizations in parallel. + Default: 1 + Maps to: --split-compile= (-split-compile) + fdevice_syntax_only : bool, optional + Ends device compilation after front-end syntax checking. + Default: False + Maps to: --fdevice-syntax-only (-fdevice-syntax-only) + minimal : bool, optional + Omit certain language features to reduce compile time for small programs. + Default: False + Maps to: --minimal (-minimal) + """ + + gpu_architecture: Optional[str] = None + device_c: Optional[bool] = None + device_w: Optional[bool] = None + relocatable_device_code: Optional[bool] = None + extensible_whole_program: Optional[bool] = None + device_debug: Optional[bool] = None + generate_line_info: Optional[bool] = None + device_optimize: Optional[bool] = None + ptxas_options: Optional[Union[str, List[str]]] = None + maxrregcount: Optional[int] = None + ftz: Optional[bool] = None + prec_sqrt: Optional[bool] = None + prec_div: Optional[bool] = None + fmad: Optional[bool] = None + use_fast_math: Optional[bool] = None + extra_device_vectorization: Optional[bool] = None + dlink_time_opt: Optional[bool] = None + gen_opt_lto: Optional[bool] = None + define_macro: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None + undefine_macro: Optional[Union[str, List[str]]] = None + include_path: Optional[Union[str, List[str]]] = None + pre_include: Optional[Union[str, List[str]]] = None + no_source_include: 
 Optional[bool] = None + std: Optional[str] = None + builtin_move_forward: Optional[bool] = None + builtin_initializer_list: Optional[bool] = None + disable_warnings: Optional[bool] = None + restrict: Optional[bool] = None + device_as_default_execution_space: Optional[bool] = None + device_int128: Optional[bool] = None + optimization_info: Optional[str] = None + no_display_error_number: Optional[bool] = None + diag_error: Optional[str] = None + diag_suppress: Optional[str] = None + diag_warn: Optional[str] = None + brief_diagnostics: Optional[bool] = None + time: Optional[str] = None + split_compile: Optional[int] = None + fdevice_syntax_only: Optional[bool] = None + minimal: Optional[bool] = None + + def __post_init__(self): + self._formatted_options = [] + if self.gpu_architecture is not None: + self._formatted_options.append(f"--gpu-architecture={self.gpu_architecture}") + else: + self._formatted_options.append( + "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) + ) + if self.relocatable_device_code is not None: + self._formatted_options.append( + f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}" + ) + if self.extensible_whole_program is not None and self.extensible_whole_program: + self._formatted_options.append("--extensible-whole-program") + if self.device_debug is not None and self.device_debug: + self._formatted_options.append("--device-debug") + if self.generate_line_info is not None and self.generate_line_info: + self._formatted_options.append("--generate-line-info") + if self.device_optimize is not None: + self._formatted_options.append(f"--dopt={'on' if self.device_optimize else 'off'}") + if self.ptxas_options is not None: + self._formatted_options.append("--ptxas-options") + if isinstance(self.ptxas_options, list): + for option in self.ptxas_options: + self._formatted_options.append(option) + else: + self._formatted_options.append(self.ptxas_options) + if self.maxrregcount is not None: 
+ self._formatted_options.append(f"--maxrregcount={self.maxrregcount}") + if self.ftz is not None: + self._formatted_options.append(f"--ftz={_handle_boolean_option(self.ftz)}") + if self.prec_sqrt is not None: + self._formatted_options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") + if self.prec_div is not None: + self._formatted_options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") + if self.fmad is not None: + self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fmad)}") + if self.use_fast_math is not None and self.use_fast_math: + self._formatted_options.append("--use_fast_math") + if self.extra_device_vectorization is not None and self.extra_device_vectorization: + self._formatted_options.append("--extra-device-vectorization") + if self.dlink_time_opt is not None and self.dlink_time_opt: + self._formatted_options.append("--dlink-time-opt") + if self.gen_opt_lto is not None and self.gen_opt_lto: + self._formatted_options.append("--gen-opt-lto") + if self.define_macro is not None: + if isinstance(self.define_macro, list): + for macro in self.define_macro: + if isinstance(macro, tuple): + assert len(macro) == 2 + self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") + else: + self._formatted_options.append(f"--define-macro={macro}") + elif isinstance(self.define_macro, tuple): + assert len(self.define_macro) == 2 + self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + else: + self._formatted_options.append(f"--define-macro={self.define_macro}") + + if self.undefine_macro is not None: + if isinstance(self.undefine_macro, list): + for macro in self.undefine_macro: + self._formatted_options.append(f"--undefine-macro={macro}") + else: + self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") + if self.include_path is not None: + if isinstance(self.include_path, list): + for path in self.include_path: + 
self._formatted_options.append(f"--include-path={path}") + else: + self._formatted_options.append(f"--include-path={self.include_path}") + if self.pre_include is not None: + if isinstance(self.pre_include, list): + for header in self.pre_include: + self._formatted_options.append(f"--pre-include={header}") + else: + self._formatted_options.append(f"--pre-include={self.pre_include}") + if self.no_source_include is not None and self.no_source_include: + self._formatted_options.append("--no-source-include") + if self.std is not None: + self._formatted_options.append(f"--std={self.std}") + if self.builtin_move_forward is not None: + self._formatted_options.append( + f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}" + ) + if self.builtin_initializer_list is not None: + self._formatted_options.append( + f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}" + ) + if self.disable_warnings is not None and self.disable_warnings: + self._formatted_options.append("--disable-warnings") + if self.restrict is not None and self.restrict: + self._formatted_options.append("--restrict") + if self.device_as_default_execution_space is not None and self.device_as_default_execution_space: + self._formatted_options.append("--device-as-default-execution-space") + if self.device_int128 is not None and self.device_int128: + self._formatted_options.append("--device-int128") + if self.optimization_info is not None: + self._formatted_options.append(f"--optimization-info={self.optimization_info}") + if self.no_display_error_number is not None and self.no_display_error_number: + self._formatted_options.append("--no-display-error-number") + if self.diag_error is not None: + self._formatted_options.append(f"--diag-error={self.diag_error}") + if self.diag_suppress is not None: + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + if self.diag_warn is not None: + 
self._formatted_options.append(f"--diag-warn={self.diag_warn}") + if self.brief_diagnostics is not None: + self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") + if self.time is not None: + self._formatted_options.append(f"--time={self.time}") + if self.split_compile is not None: + self._formatted_options.append(f"--split-compile={self.split_compile}") + if self.fdevice_syntax_only is not None and self.fdevice_syntax_only: + self._formatted_options.append("--fdevice-syntax-only") + if self.minimal is not None and self.minimal: + self._formatted_options.append("--minimal") + + def _as_bytes(self): + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + result = [] + for option in self._formatted_options: + result.append(option.encode()) + return result + + def __repr__(self): + # __TODO__ improve this + return self._formatted_options class Program: @@ -21,14 +365,16 @@ class Program: String of the CUDA Runtime Compilation program. code_type : Any String of the code type. Currently only ``"c++"`` is supported. - + options : ProgramOptions, optional + A ProgramOptions object to customize the compilation process. + See :obj:`ProgramOptions` for more information. 
""" - __slots__ = ("_handle", "_backend") + __slots__ = ("_handle", "_backend", "_options") _supported_code_type = ("c++",) _supported_target_type = ("ptx", "cubin", "ltoir") - def __init__(self, code, code_type): + def __init__(self, code, code_type, options: ProgramOptions = None): self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError @@ -37,12 +383,16 @@ def __init__(self, code, code_type): if not isinstance(code, str): raise TypeError # TODO: support pre-loaded headers & include names - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved self._handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" else: raise NotImplementedError + if options is None: + self._options = [] + else: + self._options = options._as_bytes() + def __del__(self): """Return close(self).""" self.close() @@ -53,7 +403,7 @@ def close(self): handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None - def compile(self, target_type, options=(), name_expressions=(), logs=None): + def compile(self, target_type, name_expressions=(), logs=None): """Compile the program with a specific compilation type. Parameters @@ -61,9 +411,6 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): target_type : Any String of the targeted compilation type. Supported options are "ptx", "cubin" and "ltoir". - options : Union[List, Tuple], optional - List of compilation options associated with the backend - of this :obj:`Program`. (Default to no options) name_expressions : Union[List, Tuple], optional List of explicit name expressions to become accessible. 
(Default to no expressions) @@ -85,9 +432,9 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle) - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - options = list(o.encode() for o in options) - handle_return(nvrtc.nvrtcCompileProgram(self._handle, len(options), options), handle=self._handle) + handle_return( + nvrtc.nvrtcCompileProgram(self._handle, len(self._options), self._options), handle=self._handle + ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 6a68d175..5756dd34 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -211,9 +211,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): try: stream = Stream._init(event_or_stream) except Exception as e: - raise ValueError( - "only an Event, Stream, or object supporting __cuda_stream__ can be waited" - ) from e + raise ValueError("only an Event, Stream, or object supporting __cuda_stream__ can be waited") from e else: stream = event_or_stream event = handle_return(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 9cb47a33..74488a53 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -87,6 +87,13 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals return options +def _handle_boolean_option(option: bool) -> str: + """ + Convert a boolean option to a string representation. 
+ """ + return str(option).lower() + + def precondition(checker: Callable[..., None], what: str = "") -> Callable: """ A decorator that adds checks to ensure any preconditions are met. diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..914e9c17 --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,12 @@ +# `cuda.core` Release notes + +Released on , 2024 + +## Hightlights +- Add ProgramOptions to facilitate the passing of runtime compile options to [Program](#program) + +## Limitations +- + +## Breaking Changes +- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 8caa4d4a..c6b4be85 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -6,7 +6,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ @@ -29,13 +29,10 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") +program_options = ProgramOptions(std="c++11", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", - options=( - "-std=c++11", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), logs=sys.stdout, name_expressions=("saxpy", "saxpy"), ) diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 550eaf2a..60b9b0db 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -4,7 +4,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from 
cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ @@ -26,15 +26,9 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") -mod = prog.compile( - "cubin", - options=( - "-std=c++17", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), - name_expressions=("vector_add",), -) +program_options = ProgramOptions(std="c++17", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +prog = Program(code, code_type="c++", options=program_options) +mod = prog.compile("cubin", name_expressions=("vector_add",)) # run in single precision ker = mod.get_kernel("vector_add") diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index af94a7ba..e6ae3334 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -8,8 +8,45 @@ import pytest -from cuda.core.experimental import Program from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions + + +def test_program_with_various_options(init_cuda): + code = 'extern "C" __global__ void my_kernel() {}' + + options_list = [ + ProgramOptions(ptxas_options="-v"), + ProgramOptions(ptxas_options=["-v", "-O3"]), + ProgramOptions(device_optimize=True, device_debug=True), + ProgramOptions(relocatable_device_code=True, maxrregcount=32), + ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), + ProgramOptions(fmad=False, use_fast_math=True), + ProgramOptions(extra_device_vectorization=True), + ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), + ProgramOptions(define_macro="MY_MACRO"), + ProgramOptions(define_macro=("MY_MACRO", "99")), + ProgramOptions(define_macro=[("MY_MACRO", "99")]), + ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), + ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), + ProgramOptions(undefine_macro="MY_MACRO", 
include_path="/usr/local/include"), + ProgramOptions(pre_include="my_header.h", no_source_include=True), + ProgramOptions(builtin_initializer_list=False, disable_warnings=True), + ProgramOptions(restrict=True, device_as_default_execution_space=True), + ProgramOptions(device_int128=True, optimization_info="inline"), + ProgramOptions(no_display_error_number=True), + ProgramOptions(diag_error="1234", diag_suppress="5678"), + ProgramOptions(diag_warn="91011", brief_diagnostics=True), + ProgramOptions(time="compile_time.csv", split_compile=4), + ProgramOptions(fdevice_syntax_only=True, minimal=True), + ] + + # TODO compile the program once the CI is set up + for options in options_list: + program = Program(code, "c++", options) + assert program.backend == "nvrtc" + program.close() + assert program.handle is None def test_program_init_valid_code_type(): From 66ceb8508100fbbbe2b7ab3c547e895127a6c335 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:29:32 -0800 Subject: [PATCH 02/44] remove stream from commit --- cuda_core/cuda/core/experimental/_stream.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 5756dd34..6a68d175 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -211,7 +211,9 @@ def wait(self, event_or_stream: Union[Event, Stream]): try: stream = Stream._init(event_or_stream) except Exception as e: - raise ValueError("only an Event, Stream, or object supporting __cuda_stream__ can be waited") from e + raise ValueError( + "only an Event, Stream, or object supporting __cuda_stream__ can be waited" + ) from e else: stream = event_or_stream event = handle_return(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) From 37a945c92b99ffd0d2311ea9fc8c9c6b0f947972 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:42:54 -0800 Subject: [PATCH 03/44] modify 
doc source --- cuda_core/docs/source/api.rst | 4 ++++ cuda_core/docs/source/release.md | 1 + 2 files changed, 5 insertions(+) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..32a4ac2d 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,7 @@ CUDA compilation toolchain :toctree: generated/ Program + + :template: dataclass.rst + + ProgramOptions \ No newline at end of file diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..4c615eb3 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,5 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 ``` From 3555e2eed747b2f402219b9a74d2b8fa09ccfc03 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 11:44:15 -0800 Subject: [PATCH 04/44] modify doc source --- cuda_core/docs/source/release/0.2.0-notes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md index 914e9c17..3ba043ce 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -3,7 +3,7 @@ Released on , 2024 ## Hightlights -- Add ProgramOptions to facilitate the passing of runtime compile options to [Program](#program) +- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` ## Limitations - From a9ac448e4ac6bab862498eb35763bbc9ddf2e1d6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 18 Dec 2024 07:40:29 -0800 Subject: [PATCH 05/44] integrate program options into the tests --- cuda_core/cuda/core/experimental/_program.py | 7 +- cuda_core/examples/strided_memory_view.py | 11 +- cuda_core/examples/thread_block_cluster.py | 12 +- cuda_core/tests/test_linker.py | 14 +- cuda_core/tests/test_program.py | 209 ++++++++++--------- 5 files changed, 127 insertions(+), 126 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 5db6f6ba..feb282df 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import weakref from dataclasses import dataclass from typing import List, Optional, Tuple, Union -import weakref from cuda import nvrtc from cuda.core.experimental._device import Device @@ -440,9 +440,10 @@ def compile(self, target_type, name_expressions=(), logs=None): if self._backend == "nvrtc": if name_expressions: for n in name_expressions: - handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle) + handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) handle_return( - nvrtc.nvrtcCompileProgram(self._handle, len(self._options), self._options), handle=self._handle + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options), + handle=self._mnff.handle, ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 564d7fa0..33ae26e1 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -31,7 +31,7 @@ cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ @@ -88,15 +88,12 @@ } } """).substitute(func_sig=func_sig) - gpu_prog = Program(gpu_code, code_type="c++") + # To know the GPU's compute capability, we need to identify which GPU to use. 
dev = Device(0) arch = "".join(f"{i}" for i in dev.compute_capability) - mod = gpu_prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++11"), - ) + gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++11")) + mod = gpu_prog.compile(target_type="cubin") gpu_ker = mod.get_kernel(func_name) # Now we are prepared to run the code from the user's perspective! diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index fa70738d..7b0182cd 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -5,7 +5,7 @@ import os import sys -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # prepare include cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) @@ -44,12 +44,12 @@ # prepare program & compile kernel dev.set_current() -prog = Program(code, code_type="c++") -mod = prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"), +prog = Program( + code, + code_type="c++", + options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), ) +mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") # prepare launch config diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 54cd8cf4..bc1fb109 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,7 +4,7 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program, _linker +from cuda.core.experimental import Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode 
ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper @@ -24,18 +24,18 @@ def compile_ptx_functions(init_cuda): # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail - object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) - object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) - object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) + object_code_a_ptx = Program(kernel_a, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") + object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") + object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index e6ae3334..5d6376f0 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -1,103 +1,106 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. 
-# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions - - -def test_program_with_various_options(init_cuda): - code = 'extern "C" __global__ void my_kernel() {}' - - options_list = [ - ProgramOptions(ptxas_options="-v"), - ProgramOptions(ptxas_options=["-v", "-O3"]), - ProgramOptions(device_optimize=True, device_debug=True), - ProgramOptions(relocatable_device_code=True, maxrregcount=32), - ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), - ProgramOptions(fmad=False, use_fast_math=True), - ProgramOptions(extra_device_vectorization=True), - ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), - ProgramOptions(define_macro="MY_MACRO"), - ProgramOptions(define_macro=("MY_MACRO", "99")), - ProgramOptions(define_macro=[("MY_MACRO", "99")]), - ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), - ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), - ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), - ProgramOptions(pre_include="my_header.h", no_source_include=True), - ProgramOptions(builtin_initializer_list=False, disable_warnings=True), - ProgramOptions(restrict=True, device_as_default_execution_space=True), - ProgramOptions(device_int128=True, optimization_info="inline"), - ProgramOptions(no_display_error_number=True), - ProgramOptions(diag_error="1234", diag_suppress="5678"), - ProgramOptions(diag_warn="91011", brief_diagnostics=True), - ProgramOptions(time="compile_time.csv", split_compile=4), - ProgramOptions(fdevice_syntax_only=True, minimal=True), - ] - - # TODO compile the 
program once the CI is set up - for options in options_list: - program = Program(code, "c++", options) - assert program.backend == "nvrtc" - program.close() - assert program.handle is None - - -def test_program_init_valid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - assert program.handle is not None - - -def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") - - -def test_program_init_invalid_code_format(): - code = 12345 - with pytest.raises(TypeError): - Program(code, "c++") - - -def test_program_compile_valid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) - - -def test_program_compile_invalid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - with pytest.raises(NotImplementedError): - program.compile("invalid_target") - - -def test_program_backend_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - - -def test_program_handle_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.handle is not None - - -def test_program_close(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - program.close() - assert program.handle is None +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. 
Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest +from conftest import can_load_generated_ptx + +from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions + + +def test_program_with_various_options(init_cuda): + code = 'extern "C" __global__ void my_kernel() {}' + + options_list = [ + ProgramOptions(ptxas_options="-v"), + ProgramOptions(ptxas_options=["-v", "-O3"]), + ProgramOptions(device_optimize=True, device_debug=True), + ProgramOptions(relocatable_device_code=True, maxrregcount=32), + ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), + ProgramOptions(fmad=False, use_fast_math=True), + ProgramOptions(extra_device_vectorization=True), + ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), + ProgramOptions(define_macro="MY_MACRO"), + ProgramOptions(define_macro=("MY_MACRO", "99")), + ProgramOptions(define_macro=[("MY_MACRO", "99")]), + ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), + ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), + ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), + ProgramOptions(pre_include="my_header.h", no_source_include=True), + ProgramOptions(builtin_initializer_list=False, disable_warnings=True), + ProgramOptions(restrict=True, device_as_default_execution_space=True), + ProgramOptions(device_int128=True, optimization_info="inline"), + ProgramOptions(no_display_error_number=True), + ProgramOptions(diag_error="1234", diag_suppress="5678"), + ProgramOptions(diag_warn="91011", brief_diagnostics=True), + ProgramOptions(time="compile_time.csv", split_compile=4), + ProgramOptions(fdevice_syntax_only=True, minimal=True), + ] + + # TODO compile the program once the CI is set up + for options in options_list: + program = Program(code, "c++", options) + assert 
program.backend == "nvrtc" + program.close() + assert program.handle is None + + +def test_program_init_valid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + assert program.handle is not None + + +def test_program_init_invalid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + with pytest.raises(NotImplementedError): + Program(code, "python") + + +def test_program_init_invalid_code_format(): + code = 12345 + with pytest.raises(TypeError): + Program(code, "c++") + + +# TODO: incorporate this check in Program +@pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") +def test_program_compile_valid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + object_code = program.compile("ptx") + kernel = object_code.get_kernel("my_kernel") + assert isinstance(object_code, ObjectCode) + assert isinstance(kernel, Kernel) + + +def test_program_compile_invalid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + with pytest.raises(NotImplementedError): + program.compile("invalid_target") + + +def test_program_backend_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + + +def test_program_handle_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.handle is not None + + +def test_program_close(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + program.close() + assert program.handle is None From 6790c83b39bbb7702cd032ae20b01491c02733a3 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Sun, 29 Dec 2024 22:04:11 -0800 Subject: [PATCH 06/44] update the attribute names for consistency across linker and program --- cuda_core/cuda/core/experimental/_program.py | 44 
++++++++++---------- cuda_core/tests/test_linker.py | 10 +++-- cuda_core/tests/test_program.py | 8 ++-- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index feb282df..08569f33 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -18,7 +18,7 @@ class ProgramOptions: Attributes ---------- - gpu_architecture : str, optional + arch : str, optional Specify the name of the class of GPU architectures for which the input must be compiled. Valid values: compute_50, compute_52, compute_53, compute_60, compute_61, compute_62, compute_70, compute_72, compute_75, compute_80, compute_87, compute_89, compute_90, compute_90a, sm_50, sm_52, sm_53, sm_60, sm_61, @@ -33,15 +33,15 @@ class ProgramOptions: Do extensible whole program compilation of device code. Default: False Maps to: --extensible-whole-program (-ewp) - device_debug : bool, optional + debug : bool, optional Generate debug information. If --dopt is not specified, then turns off all optimizations. Default: False Maps to: --device-debug (-G) - generate_line_info : bool, optional + lineinfo: bool, optional Generate line-number information. Default: False Maps to: --generate-line-info (-lineinfo) - device_optimize : bool, optional + device_code_optimize : bool, optional Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation for optimized device code. Default: None @@ -51,7 +51,7 @@ class ProgramOptions: For example ["-v", "-O2"]. Default: None Maps to: --ptxas-options (-Xptxas) - maxrregcount : int, optional + max_register_count : int, optional Specify the maximum amount of registers that GPU functions can use. Default: None Maps to: --maxrregcount= (-maxrregcount) @@ -69,7 +69,7 @@ class ProgramOptions: approximation. 
Default: True Maps to: --prec-div={true|false} (-prec-div) - fmad : bool, optional + fma : bool, optional Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations. Default: True @@ -82,7 +82,7 @@ class ProgramOptions: Enables more aggressive device code vectorization in the NVVM optimizer. Default: False Maps to: --extra-device-vectorization (-extra-device-vectorization) - dlink_time_opt : bool, optional + link_time_optimization : bool, optional Generate intermediate code for later link-time optimization. Default: False Maps to: --dlink-time-opt (-dlto) @@ -186,18 +186,18 @@ class ProgramOptions: device_w: Optional[bool] = None relocatable_device_code: Optional[bool] = None extensible_whole_program: Optional[bool] = None - device_debug: Optional[bool] = None - generate_line_info: Optional[bool] = None - device_optimize: Optional[bool] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + device_code_optimize: Optional[bool] = None ptxas_options: Optional[Union[str, List[str]]] = None - maxrregcount: Optional[int] = None + max_register_count: Optional[int] = None ftz: Optional[bool] = None prec_sqrt: Optional[bool] = None prec_div: Optional[bool] = None - fmad: Optional[bool] = None + fma: Optional[bool] = None use_fast_math: Optional[bool] = None extra_device_vectorization: Optional[bool] = None - dlink_time_opt: Optional[bool] = None + link_time_optimization: Optional[bool] = None gen_opt_lto: Optional[bool] = None define_macro: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None undefine_macro: Optional[Union[str, List[str]]] = None @@ -236,12 +236,12 @@ def __post_init__(self): ) if self.extensible_whole_program is not None and self.extensible_whole_program: self._formatted_options.append("--extensible-whole-program") - if self.device_debug is not None and self.device_debug: + if self.debug is not None and self.debug: 
self._formatted_options.append("--device-debug") - if self.generate_line_info is not None and self.generate_line_info: + if self.lineinfo is not None and self.lineinfo: self._formatted_options.append("--generate-line-info") - if self.device_optimize is not None: - self._formatted_options.append(f"--dopt={'on' if self.device_optimize else 'off'}") + if self.device_code_optimize is not None: + self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not None: self._formatted_options.append("--ptxas-options") if isinstance(self.ptxas_options, list): @@ -249,21 +249,21 @@ def __post_init__(self): self._formatted_options.append(option) else: self._formatted_options.append("self.ptxas_options") - if self.maxrregcount is not None: - self._formatted_options.append(f"--maxrregcount={self.maxrregcount}") + if self.max_register_count is not None: + self._formatted_options.append(f"--maxrregcount={self.max_register_count}") if self.ftz is not None: self._formatted_options.append(f"--ftz={_handle_boolean_option(self.ftz)}") if self.prec_sqrt is not None: self._formatted_options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") if self.prec_div is not None: self._formatted_options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") - if self.fmad is not None: - self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fmad)}") + if self.fma is not None: + self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fma)}") if self.use_fast_math is not None and self.use_fast_math: self._formatted_options.append("--use_fast_math") if self.extra_device_vectorization is not None and self.extra_device_vectorization: self._formatted_options.append("--extra-device-vectorization") - if self.dlink_time_opt is not None and self.dlink_time_opt: + if self.link_time_optimization is not None and self.link_time_optimization: self._formatted_options.append("--dlink-time-opt") if self.gen_opt_lto 
is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index bc1fb109..83ce5208 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -33,9 +33,13 @@ def compile_ptx_functions(init_cuda): @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") - object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") - object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(dlink_time_opt=True)).compile("ltoir") + object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(link_time_optimization=True)).compile("ltoir") + object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) + object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 5d6376f0..fd2515b9 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -19,12 +19,12 @@ def test_program_with_various_options(init_cuda): options_list = [ ProgramOptions(ptxas_options="-v"), ProgramOptions(ptxas_options=["-v", "-O3"]), - ProgramOptions(device_optimize=True, device_debug=True), - ProgramOptions(relocatable_device_code=True, maxrregcount=32), + ProgramOptions(device_code_optimize=True, debug=True), + ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), - ProgramOptions(fmad=False, use_fast_math=True), + ProgramOptions(fma=False, use_fast_math=True), ProgramOptions(extra_device_vectorization=True), - 
ProgramOptions(dlink_time_opt=True, gen_opt_lto=True), + ProgramOptions(link_time_optimization=True, gen_opt_lto=True), ProgramOptions(define_macro="MY_MACRO"), ProgramOptions(define_macro=("MY_MACRO", "99")), ProgramOptions(define_macro=[("MY_MACRO", "99")]), From 7d5b894fda1066788ab9ae50486cb59cd1f5d5e5 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 30 Dec 2024 16:11:20 -0800 Subject: [PATCH 07/44] fix module test --- cuda_core/tests/test_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 61933d41..7db017f1 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -10,13 +10,13 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Program +from cuda.core.experimental import Program, ProgramOptions @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_get_kernel(): kernel = """extern "C" __global__ void ABC() { }""" - object_code = Program(kernel, "c++").compile("ptx", options=("-rdc=true",)) + object_code = Program(kernel, "c++", options=ProgramOptions(relocatable_device_code=True)).compile("ptx") assert object_code._handle is None kernel = object_code.get_kernel("ABC") assert object_code._handle is not None From 1789a849500416020da439d358340e58f2ea3107 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 30 Dec 2024 17:35:59 -0800 Subject: [PATCH 08/44] update the tests --- cuda_core/cuda/core/experimental/_linker.py | 11 +++++---- cuda_core/cuda/core/experimental/_program.py | 6 ++--- cuda_core/docs/source/release/0.2.0-notes.md | 24 ++++++++++---------- cuda_core/examples/saxpy.py | 2 +- cuda_core/examples/strided_memory_view.py | 2 +- cuda_core/examples/thread_block_cluster.py | 2 +- cuda_core/examples/vector_add.py | 2 +- cuda_core/tests/test_linker.py | 14 +++++++----- 8 files changed, 34 insertions(+), 29 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 2beeb168..570a6b04 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -9,6 +9,7 @@ from typing import List, Optional from cuda import cuda +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import check_or_create_options, handle_return @@ -84,10 +85,10 @@ class LinkerOptions: Attributes ---------- - arch : str + arch : str, optional Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or - ``compute_`` (for generating PTX). - This is a required option. + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. max_register_count : int, optional Maximum register count. Maps to: ``-maxrregcount=``. @@ -165,7 +166,7 @@ class LinkerOptions: Default: False. """ - arch: str + arch: Optional[str] = None max_register_count: Optional[int] = None time: Optional[bool] = None verbose: Optional[bool] = None @@ -197,6 +198,8 @@ def __post_init__(self): def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") + else: + self.formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability)) if self.max_register_count is not None: self.formatted_options.append(f"-maxrregcount={self.max_register_count}") if self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 08569f33..53cc6a22 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -181,7 +181,7 @@ class ProgramOptions: Maps to: --minimal (-minimal) """ - gpu_architecture: Optional[str] = None + arch: Optional[str] = None device_c: Optional[bool] = None device_w: Optional[bool] = None relocatable_device_code: 
Optional[bool] = None @@ -224,8 +224,8 @@ class ProgramOptions: def __post_init__(self): self._formatted_options = [] - if self.gpu_architecture is not None: - self._formatted_options.append(f"--gpu-architecture={self.gpu_architecture}") + if self.arch is not None: + self._formatted_options.append(f"--gpu-architecture={self.arch}") else: self._formatted_options.append( "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) ) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md index 3ba043ce..a2091d4d 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -1,12 +1,12 @@ -# `cuda.core` Release notes - -Released on , 2024 - -## Hightlights -- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` - -## Limitations -- - -## Breaking Changes -- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. +# `cuda.core` Release notes + +Released on , 2024 + +## Highlights +- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` + +## Limitations +- + +## Breaking Changes +- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. 
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 56db78da..ad8de57d 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -29,7 +29,7 @@ s = dev.create_stream() # prepare program -program_options = ProgramOptions(std="c++11", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +program_options = ProgramOptions(std="c++11", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 12b917ae..ed970d50 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -93,7 +93,7 @@ dev = Device(0) dev.set_current() arch = "".join(f"{i}" for i in dev.compute_capability) - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++11")) + gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11")) mod = gpu_prog.compile(target_type="cubin") gpu_ker = mod.get_kernel(func_name) diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index 7b0182cd..f4333844 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -47,7 +47,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(gpu_architecture=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), ) mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index b12db84d..5a42ab7b 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -26,7 +26,7 @@ s = dev.create_stream() # prepare 
program -program_options = ProgramOptions(std="c++17", gpu_architecture="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +program_options = ProgramOptions(std="c++17", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) prog = Program(code, code_type="c++", options=program_options) mod = prog.compile("cubin", name_expressions=("vector_add",)) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 83ce5208..4d1f5731 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,11 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program, ProgramOptions, _linker +from cuda.bindings import nvjitlink +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) kernel_a = """ extern __device__ int B(); @@ -45,6 +46,7 @@ def compile_ltoir_functions(init_cuda): culink_options = [ + LinkerOptions(), LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, optimization_level=3), @@ -81,10 +83,10 @@ def test_linker_init(compile_ptx_functions, options): assert isinstance(object_code, ObjectCode) -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) +def test_linker_init_invalid_arch(compile_ptx_functions): + options = LinkerOptions(arch="99", ptx=True) + with pytest.raises(nvjitlink.nvJitLinkError): + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") From 2285facdaff03b13b7e3d6ba47d601289dd381ab Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 1 Jan 2025 13:52:11 -0800 Subject: [PATCH 09/44] update the tests --- cuda_core/tests/test_linker.py | 
8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 4d1f5731..f4e71c43 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -85,8 +85,12 @@ def test_linker_init(compile_ptx_functions, options): def test_linker_init_invalid_arch(compile_ptx_functions): options = LinkerOptions(arch="99", ptx=True) - with pytest.raises(nvjitlink.nvJitLinkError): - Linker(*compile_ptx_functions, options=options) + if culink_backend: + with pytest.raises(AttributeError): + Linker(*compile_ptx_functions, options=options) + else: + with pytest.raises(nvjitlink.nvJitLinkError): + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") From bb6204838b5ccb4f0ffccad4a86effb4c7c3d10b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 2 Jan 2025 11:39:39 -0800 Subject: [PATCH 10/44] move ProgramOptions ctor into pytest raises --- cuda_core/tests/test_linker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index f4e71c43..9a318651 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -52,7 +52,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), + LinkerOptions(arch=ARCH, no_cache=True), # TODO: consider adding cuda 12.4 to test matrix in which case this + # will fail. 
Tracked in issue #337 ] @@ -78,18 +79,20 @@ def compile_ltoir_functions(init_cuda): ], ) def test_linker_init(compile_ptx_functions, options): + print(culink_backend) linker = Linker(*compile_ptx_functions, options=options) object_code = linker.link("cubin") assert isinstance(object_code, ObjectCode) def test_linker_init_invalid_arch(compile_ptx_functions): - options = LinkerOptions(arch="99", ptx=True) if culink_backend: with pytest.raises(AttributeError): + options = LinkerOptions(arch="99", ptx=True) Linker(*compile_ptx_functions, options=options) else: with pytest.raises(nvjitlink.nvJitLinkError): + options = LinkerOptions(arch="99", ptx=True) Linker(*compile_ptx_functions, options=options) From 0f0ca9b9477995631113df7aaec2918ee2a4a655 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 2 Jan 2025 12:00:59 -0800 Subject: [PATCH 11/44] only import nvjitlink if its available --- cuda_core/tests/test_linker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 9a318651..0073c4c9 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,7 +4,6 @@ import pytest -from cuda.bindings import nvjitlink from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode @@ -19,6 +18,8 @@ device_function_c = "__device__ int C(int a, int b) { return a + b; }" culink_backend = _linker._decide_nvjitlink_or_driver() +if not culink_backend: + from cuda.bindings import nvjitlink @pytest.fixture(scope="function") From 58f2b09f88b275c95009eeb692aa4bfac2886c24 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:53:51 -0800 Subject: [PATCH 12/44] Update cuda_core/examples/saxpy.py Co-authored-by: Leo Fang --- cuda_core/examples/saxpy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 
ad8de57d..8fc4b91b 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -29,7 +29,8 @@ s = dev.create_stream() # prepare program -program_options = ProgramOptions(std="c++11", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}") prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", From 7afe54e9d84650ec7f2055d356edb3c94eb04810 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:54:19 -0800 Subject: [PATCH 13/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 53cc6a22..c8de3f86 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -403,10 +403,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): else: raise NotImplementedError - if options is None: - self._options = [] - else: - self._options = options._as_bytes() + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") def close(self): """Destroy this program.""" From 653a3e14b4315e7d0e175d843b81ab5aadd79bf9 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:54:27 -0800 Subject: [PATCH 14/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index c8de3f86..034ffafa 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -439,7 +439,7 @@ def 
compile(self, target_type, name_expressions=(), logs=None): for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options), + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options._as_bytes()), handle=self._mnff.handle, ) From 5161a43cef5fa619d12ca49edac5bbd8d9ed1805 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 3 Jan 2025 14:54:42 -0800 Subject: [PATCH 15/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 034ffafa..bdcff3ce 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -342,10 +342,7 @@ def __post_init__(self): def _as_bytes(self): # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - result = [] - for option in self._formatted_options: - result.append(option.encode()) - return result + return list(o.encode() for o in self._formatted_options) def __repr__(self): # __TODO__ improve this From f461eec29f15f1595d26781a9cab95c2d6051e7c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 3 Jan 2025 15:11:29 -0800 Subject: [PATCH 16/44] tweak doc source --- cuda_core/docs/source/index.rst | 2 +- cuda_core/docs/source/release.md | 11 ----------- cuda_core/docs/source/release.rst | 11 +++++++++++ .../release/{0.1.0-notes.md => 0.1.0-notes.rst} | 5 ++++- .../release/{0.1.1-notes.md => 0.1.1-notes.rst} | 5 ++++- .../release/{0.2.0-notes.md => 0.2.0-notes.rst} | 3 +++ 6 files changed, 23 insertions(+), 14 deletions(-) delete mode 100644 cuda_core/docs/source/release.md create mode 100644 cuda_core/docs/source/release.rst rename 
cuda_core/docs/source/release/{0.1.0-notes.md => 0.1.0-notes.rst} (86%) rename cuda_core/docs/source/release/{0.1.1-notes.md => 0.1.1-notes.rst} (95%) rename cuda_core/docs/source/release/{0.2.0-notes.md => 0.2.0-notes.rst} (85%) diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst index 19c14c2a..1aedfaa8 100644 --- a/cuda_core/docs/source/index.rst +++ b/cuda_core/docs/source/index.rst @@ -8,7 +8,7 @@ and other functionalities. :maxdepth: 2 :caption: Contents: - release.md + release.rst install.md interoperability.rst api.rst diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md deleted file mode 100644 index fa32175f..00000000 --- a/cuda_core/docs/source/release.md +++ /dev/null @@ -1,11 +0,0 @@ -# Release Notes - -```{toctree} ---- -maxdepth: 3 ---- - - 0.1.1 - 0.1.0 - 0.2.0 -``` diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst new file mode 100644 index 00000000..f2ae8ea0 --- /dev/null +++ b/cuda_core/docs/source/release.rst @@ -0,0 +1,11 @@ +# Release Notes + +Release Notes +============= + +.. 
toctree:: + :maxdepth: 3 + + release/0.2.0-notes + release/0.1.1-notes + release/0.1.0-notes diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.rst similarity index 86% rename from cuda_core/docs/source/release/0.1.0-notes.md rename to cuda_core/docs/source/release/0.1.0-notes.rst index 5d1c7fd5..83c4e56b 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,4 +1,7 @@ -# `cuda.core` v0.1.0 Release notes +# `cuda.core` Release notes + +`cuda.core` 0.1.0 Release Notes +=============================== Released on Nov 8, 2024 diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.rst similarity index 95% rename from cuda_core/docs/source/release/0.1.1-notes.md rename to cuda_core/docs/source/release/0.1.1-notes.rst index 9550cb23..8903a32a 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,4 +1,7 @@ -# `cuda.core` v0.1.1 Release notes +# `cuda.core` Release notes + +`cuda.core` 0.1.1 Release Notes +=============================== Released on Dec 20, 2024 diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.rst similarity index 85% rename from cuda_core/docs/source/release/0.2.0-notes.md rename to cuda_core/docs/source/release/0.2.0-notes.rst index a2091d4d..852875b2 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,5 +1,8 @@ # `cuda.core` Release notes +`cuda.core` 0.2.0 Release Notes +=============================== + Released on , 2024 ## Hightlights From 236db71429f9b70fe0f28984b229751124f878da Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 3 Jan 2025 15:28:51 -0800 Subject: [PATCH 17/44] tweak docs --- cuda_core/docs/source/release/0.1.0-notes.rst | 13 +++++----- cuda_core/docs/source/release/0.1.1-notes.rst | 25 
+++++++++++-------- cuda_core/docs/source/release/0.2.0-notes.rst | 20 +++++++++------ 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index 83c4e56b..fb1d8b77 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,20 +1,21 @@ -# `cuda.core` Release notes - `cuda.core` 0.1.0 Release Notes =============================== Released on Nov 8, 2024 -## Hightlights +Highlights +---------- + - Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities -## Limitations +Limitations +----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make `cuda.core` better! - Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) +- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) \ No newline at end of file diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 8903a32a..639a0559 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,11 +1,10 @@ -# `cuda.core` Release notes - `cuda.core` 0.1.1 Release Notes =============================== Released on Dec 20, 2024 -## Hightlights +Highlights +---------- - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. 
@@ -14,12 +13,14 @@ Released on Dec 20, 2024 detected in the current environment. - Support `pip install cuda-core`. Please see the Installation Guide for further details. -## New features +New features +------------ -- Add a `cuda.core.experimental.system` module for querying system- or process- wide information. +- Add a `cuda.core.experimental.system` module for querying system- or process-wide information. - Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. -## Enhancements +Enhancements +------------ - The internal handle held by `ObjectCode` is now lazily initialized upon first touch. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. @@ -28,19 +29,21 @@ Released on Dec 20, 2024 - Improve test coverage & documentation cross-references. - Enforce code formatting. -## Bug fixes +Bug fixes +--------- - Eliminate potential class destruction issues. - Fix circular import during handling a foreign CUDA stream. -## Limitations +Limitations +----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make `cuda.core` better! - Using `cuda.core` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently not supported. This will be fixed in a future release. - Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, - the backend is the cuLink api which supports only a subset of the options that nvjitlink does. + the backend is the cuLink API which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6. - To use `cuda.core` with Python 3.13, it currently requires building `cuda-python` from source - prior to `pip install`. This extra step will be fixed soon. + prior to `pip install`. 
This extra step will be fixed soon. \ No newline at end of file diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 852875b2..75ed3f95 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,15 +1,19 @@ -# `cuda.core` Release notes - `cuda.core` 0.2.0 Release Notes =============================== Released on , 2024 -## Hightlights -- Add ProgramOptions to facilitate the passing of runtime compile options to `Program` +Highlights +---------- + +- Add `ProgramOptions` to facilitate the passing of runtime compile options to `Program`. + +Limitations +----------- + +- -## Limitations -- +Breaking Changes +---------------- -## Breaking Changes -- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. +- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. 
\ No newline at end of file From 2f74ca3c53b09bbe7eafc0d471db8b2beb76a3c7 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 3 Jan 2025 15:43:58 -0800 Subject: [PATCH 18/44] tweak fix --- cuda_core/cuda/core/experimental/_program.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index bdcff3ce..0e8c8207 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -9,7 +9,7 @@ from cuda import nvrtc from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import _handle_boolean_option, handle_return +from cuda.core.experimental._utils import _handle_boolean_option, check_or_create_options, handle_return @dataclass @@ -435,8 +435,9 @@ def compile(self, target_type, name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) + options = self._options._as_bytes() handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.handle, len(self._options), self._options._as_bytes()), + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), handle=self._mnff.handle, ) From 9c88ba72629be37e6a7a838c9687d83672df47b0 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:01:52 -0800 Subject: [PATCH 19/44] Update cuda_core/cuda/core/experimental/_linker.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 81357e07..54b14395 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -94,7 +94,7 @@ class LinkerOptions: Attributes ---------- arch : str, optional - 
Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or ``compute_`` (for generating PTX). If not provided, the current device's architecture will be used. max_register_count : int, optional From 133f6aa42ee581edfb59f70ff0171f27bf5e703c Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:02:03 -0800 Subject: [PATCH 20/44] Update cuda_core/docs/source/release.rst Co-authored-by: Leo Fang --- cuda_core/docs/source/release.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst index f2ae8ea0..4e74423f 100644 --- a/cuda_core/docs/source/release.rst +++ b/cuda_core/docs/source/release.rst @@ -1,5 +1,3 @@ -# Release Notes - Release Notes ============= From cb06afcdc29a277a6b170e44cf32887647910fa9 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:02:47 -0800 Subject: [PATCH 21/44] Update cuda_core/docs/source/release/0.1.0-notes.rst Co-authored-by: Leo Fang --- cuda_core/docs/source/release/0.1.0-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index fb1d8b77..a342ccdd 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -18,4 +18,4 @@ Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedback with us so that we can make `cuda.core` better! 
- Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) \ No newline at end of file +- Windows TCC mode is `not yet supported `_ \ No newline at end of file From 3578f949d24794be2b8c0044b1661012a2217b6f Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:19:04 -0800 Subject: [PATCH 22/44] fix tests --- cuda_core/tests/test_program.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index fd2515b9..3f4e6b85 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -17,8 +17,6 @@ def test_program_with_various_options(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' options_list = [ - ProgramOptions(ptxas_options="-v"), - ProgramOptions(ptxas_options=["-v", "-O3"]), ProgramOptions(device_code_optimize=True, debug=True), ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), @@ -31,21 +29,20 @@ def test_program_with_various_options(init_cuda): ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), - ProgramOptions(pre_include="my_header.h", no_source_include=True), ProgramOptions(builtin_initializer_list=False, disable_warnings=True), ProgramOptions(restrict=True, device_as_default_execution_space=True), ProgramOptions(device_int128=True, optimization_info="inline"), ProgramOptions(no_display_error_number=True), - ProgramOptions(diag_error="1234", diag_suppress="5678"), - ProgramOptions(diag_warn="91011", brief_diagnostics=True), + ProgramOptions(diag_error="1234", diag_suppress="1234"), + ProgramOptions(diag_warn="1000", brief_diagnostics=True), 
ProgramOptions(time="compile_time.csv", split_compile=4), ProgramOptions(fdevice_syntax_only=True, minimal=True), ] - # TODO compile the program once the CI is set up for options in options_list: program = Program(code, "c++", options) assert program.backend == "nvrtc" + program.compile("ptx") program.close() assert program.handle is None @@ -70,6 +67,7 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program +# This is tested against the current device's arch @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' From 261588c215b7797f3d120c9683bdda2dbfaccc80 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:21:07 -0800 Subject: [PATCH 23/44] fix quotes --- cuda_core/docs/source/release/0.1.0-notes.rst | 4 +-- cuda_core/docs/source/release/0.1.1-notes.rst | 28 +++++++++---------- cuda_core/docs/source/release/0.2.0-notes.rst | 6 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index fb1d8b77..2fff35db 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,4 +1,4 @@ -`cuda.core` 0.1.0 Release Notes +``cuda.core`` 0.1.0 Release Notes =============================== Released on Nov 8, 2024 @@ -16,6 +16,6 @@ Limitations ----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedback with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make ``cuda.core`` better! 
- Source code release only; `pip`/`conda` support is coming in a future release - Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) \ No newline at end of file diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 639a0559..d1d2b93d 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,4 +1,4 @@ -`cuda.core` 0.1.1 Release Notes +``cuda.core`` 0.1.1 Release Notes =============================== Released on Dec 20, 2024 @@ -6,26 +6,26 @@ Released on Dec 20, 2024 Highlights ---------- -- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete +- Add ``StridedMemoryView`` and ``@args_viewable_as_strided_memory`` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`. Under - the hood, it uses either the nvJitLink or driver (`cuLink*`) APIs depending on the CUDA version +- Add ``Linker`` that can link one or multiple ``ObjectCode`` instances generated by ``Program``. Under + the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version detected in the current environment. -- Support `pip install cuda-core`. Please see the Installation Guide for further details. +- Support ``pip install cuda-core``. Please see the Installation Guide for further details. New features ------------ -- Add a `cuda.core.experimental.system` module for querying system- or process-wide information. -- Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. +- Add a ``cuda.core.experimental.system`` module for querying system- or process-wide information. +- Add ``LaunchConfig.cluster`` to support thread block clusters on Hopper GPUs. 
Enhancements ------------ -- The internal handle held by `ObjectCode` is now lazily initialized upon first touch. +- The internal handle held by ``ObjectCode`` is now lazily initialized upon first touch. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. -- Ensure `"ltoir"` is a valid code type to `ObjectCode`. -- Document the `__cuda_stream__` protocol. +- Ensure ``"ltoir"`` is a valid code type to ``ObjectCode``. +- Document the ``__cuda_stream__`` protocol. - Improve test coverage & documentation cross-references. - Enforce code formatting. @@ -39,11 +39,11 @@ Limitations ----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedback with us so that we can make `cuda.core` better! -- Using `cuda.core` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently + Please kindly share your feedback with us so that we can make ``cuda.core`` better! +- Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently not supported. This will be fixed in a future release. -- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, +- Some ``LinkerOptions`` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink API which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6. -- To use `cuda.core` with Python 3.13, it currently requires building `cuda-python` from source +- To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source prior to `pip install`. This extra step will be fixed soon. 
\ No newline at end of file diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 75ed3f95..38f1ef29 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,4 +1,4 @@ -`cuda.core` 0.2.0 Release Notes +``cuda.core`` 0.2.0 Release Notes =============================== Released on , 2024 @@ -6,7 +6,7 @@ Released on , 2024 Highlights ---------- -- Add `ProgramOptions` to facilitate the passing of runtime compile options to `Program`. +- Add ``ProgramOptions`` to facilitate the passing of runtime compile options to ``Program``. Limitations ----------- @@ -16,4 +16,4 @@ Limitations Breaking Changes ---------------- -- The `Program.Compile` method no longer accepts an options argument. Instead, you can optionally pass an instance of `ProgramOptions` to the constructor of `Program`. \ No newline at end of file +- The ``Program.Compile`` method no longer accepts an options argument. Instead, you can optionally pass an instance of ``ProgramOptions`` to the constructor of ``Program``. 
\ No newline at end of file From 1abc9f69ca01cf7c4628cbc7a9e75e2331648a9a Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:22:19 -0800 Subject: [PATCH 24/44] remove print --- cuda_core/tests/test_linker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 0073c4c9..37424d7b 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -80,7 +80,6 @@ def compile_ltoir_functions(init_cuda): ], ) def test_linker_init(compile_ptx_functions, options): - print(culink_backend) linker = Linker(*compile_ptx_functions, options=options) object_code = linker.link("cubin") assert isinstance(object_code, ObjectCode) From d20dcfa1c6c845f2e36388c98fe2801ff7fb305b Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 6 Jan 2025 09:23:36 -0800 Subject: [PATCH 25/44] Update cuda_core/cuda/core/experimental/_utils.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 74488a53..f3d7306b 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -91,7 +91,7 @@ def _handle_boolean_option(option: bool) -> str: """ Convert a boolean option to a string representation. 
""" - return str(option).lower() + return "true" if bool(option) else "false" def precondition(checker: Callable[..., None], what: str = "") -> Callable: From 79fad7ac771bc4bcc8901116b00bb9d14db73e67 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 09:24:23 -0800 Subject: [PATCH 26/44] fix titles --- cuda_core/docs/source/release/0.1.0-notes.rst | 2 +- cuda_core/docs/source/release/0.1.1-notes.rst | 2 +- cuda_core/docs/source/release/0.2.0-notes.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index de14128d..a5affba2 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,5 +1,5 @@ ``cuda.core`` 0.1.0 Release Notes -=============================== +================================= Released on Nov 8, 2024 diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index d1d2b93d..53ff2ce5 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,5 +1,5 @@ ``cuda.core`` 0.1.1 Release Notes -=============================== +================================= Released on Dec 20, 2024 diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 38f1ef29..8981a7c1 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,5 +1,5 @@ ``cuda.core`` 0.2.0 Release Notes -=============================== +================================= Released on , 2024 From ec9fac1952d9923b3e467d9adb050352dfd6f161 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 10:34:41 -0800 Subject: [PATCH 27/44] remove some options --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py 
b/cuda_core/tests/test_program.py index 3f4e6b85..bb3ac47b 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -22,7 +22,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), ProgramOptions(fma=False, use_fast_math=True), ProgramOptions(extra_device_vectorization=True), - ProgramOptions(link_time_optimization=True, gen_opt_lto=True), + ProgramOptions(link_time_optimization=True), ProgramOptions(define_macro="MY_MACRO"), ProgramOptions(define_macro=("MY_MACRO", "99")), ProgramOptions(define_macro=[("MY_MACRO", "99")]), @@ -34,7 +34,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(device_int128=True, optimization_info="inline"), ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error="1234", diag_suppress="1234"), - ProgramOptions(diag_warn="1000", brief_diagnostics=True), + ProgramOptions(diag_warn="1000"), ProgramOptions(time="compile_time.csv", split_compile=4), ProgramOptions(fdevice_syntax_only=True, minimal=True), ] From f55dcdcb4f69f9257b8d3f80db473a0a5f773a57 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 10:35:45 -0800 Subject: [PATCH 28/44] add TODO --- cuda_core/tests/test_program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index bb3ac47b..4d06bf90 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -13,6 +13,7 @@ from cuda.core.experimental._program import Program, ProgramOptions +# TODO handle and test options whcih are only supported on more modern CUDA versions def test_program_with_various_options(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' From b41d119e9dc1b43dc9756daaaeb688be58cf1b94 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 10:54:33 -0800 Subject: [PATCH 29/44] remove option --- cuda_core/tests/test_program.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 4d06bf90..d4e67ccc 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -36,7 +36,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error="1234", diag_suppress="1234"), ProgramOptions(diag_warn="1000"), - ProgramOptions(time="compile_time.csv", split_compile=4), + ProgramOptions(split_compile=4), ProgramOptions(fdevice_syntax_only=True, minimal=True), ] From de588dead8636a8dcb6b9f228ad691f9c09c241b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 11:17:12 -0800 Subject: [PATCH 30/44] remove options, should pass. --- cuda_core/tests/test_program.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index d4e67ccc..54947053 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -36,8 +36,6 @@ def test_program_with_various_options(init_cuda): ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error="1234", diag_suppress="1234"), ProgramOptions(diag_warn="1000"), - ProgramOptions(split_compile=4), - ProgramOptions(fdevice_syntax_only=True, minimal=True), ] for options in options_list: From 2fbca700983f238ac3dbfd8ae40b9e329407c4ea Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 11:17:49 -0800 Subject: [PATCH 31/44] add issue tracking info --- cuda_core/tests/test_program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 54947053..bbfc4079 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -14,6 +14,7 @@ # TODO handle and test options whcih are only supported on more modern CUDA versions +# tracked in #337 def test_program_with_various_options(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' From 
b79dcccb49481352e496291c1808fec9c137dfb4 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 6 Jan 2025 12:28:40 -0800 Subject: [PATCH 32/44] fix include path argument --- cuda_core/examples/thread_block_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index f4333844..dfbe71a5 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -47,7 +47,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_paths=cuda_include_path), + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path), ) mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") From bf32370cc653b42a3c6c6bdaa011d8c22211ae33 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 09:33:35 -0800 Subject: [PATCH 33/44] fix the rest format --- cuda_core/docs/source/release/0.1.0-notes.rst | 2 +- cuda_core/docs/source/release/0.1.1-notes.rst | 18 ++++++++++-------- cuda_core/docs/source/release/0.2.0-notes.rst | 6 ++++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.rst b/cuda_core/docs/source/release/0.1.0-notes.rst index a5affba2..bc327971 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.rst +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -18,4 +18,4 @@ Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedback with us so that we can make ``cuda.core`` better! 
- Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is `not yet supported `_ \ No newline at end of file +- Windows TCC mode is `not yet supported `_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 53ff2ce5..68cca671 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -1,3 +1,5 @@ +.. currentmodule:: cuda.core.experimental + ``cuda.core`` 0.1.1 Release Notes ================================= @@ -6,9 +8,9 @@ Released on Dec 20, 2024 Highlights ---------- -- Add ``StridedMemoryView`` and ``@args_viewable_as_strided_memory`` that provide a concrete +- Add :obj:`~utils.StridedMemoryView` and :func:`~utils.args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Add ``Linker`` that can link one or multiple ``ObjectCode`` instances generated by ``Program``. Under +- Add :obj:`~Linker` that can link one or multiple :obj:`~_module.ObjectCode` instances generated by :obj:`~Program`. Under the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version detected in the current environment. - Support ``pip install cuda-core``. Please see the Installation Guide for further details. @@ -16,15 +18,15 @@ Highlights New features ------------ -- Add a ``cuda.core.experimental.system`` module for querying system- or process-wide information. -- Add ``LaunchConfig.cluster`` to support thread block clusters on Hopper GPUs. +- Add a :obj:`cuda.core.experiemental.system` module for querying system- or process-wide information. +- Add :obj:`~LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. Enhancements ------------ -- The internal handle held by ``ObjectCode`` is now lazily initialized upon first touch. 
+- The internal handle held by :obj:`~_module.ObjectCode` is now lazily initialized upon first touch. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. -- Ensure ``"ltoir"`` is a valid code type to ``ObjectCode``. +- Ensure ``"ltoir"`` is a valid code type to :obj:`~_module.ObjectCode`. - Document the ``__cuda_stream__`` protocol. - Improve test coverage & documentation cross-references. - Enforce code formatting. @@ -42,8 +44,8 @@ Limitations Please kindly share your feedback with us so that we can make ``cuda.core`` better! - Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently not supported. This will be fixed in a future release. -- Some ``LinkerOptions`` are only available when using a modern version of CUDA. When using CUDA <12, +- Some :class:`~LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink API which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6. - To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source - prior to `pip install`. This extra step will be fixed soon. \ No newline at end of file + prior to `pip install`. This extra step will be fixed soon. diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 8981a7c1..3a8b5a78 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,3 +1,5 @@ +.. currentmodule:: cuda.core.experimental + ``cuda.core`` 0.2.0 Release Notes ================================= @@ -6,7 +8,7 @@ Released on , 2024 Highlights ---------- -- Add ``ProgramOptions`` to facilitate the passing of runtime compile options to ``Program``. +- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`. 
Limitations ----------- @@ -16,4 +18,4 @@ Limitations Breaking Changes ---------------- -- The ``Program.Compile`` method no longer accepts an options argument. Instead, you can optionally pass an instance of ``ProgramOptions`` to the constructor of ``Program``. \ No newline at end of file +- The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`. From 34c87805b9683444fc5aac351436eaf9f68251d3 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 10:42:12 -0800 Subject: [PATCH 34/44] handle nested tuples within lists and tuples, and fix the handling of error number options --- cuda_core/cuda/core/experimental/_program.py | 73 ++++++++++++-------- cuda_core/cuda/core/experimental/_utils.py | 16 ++++- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 0e8c8207..a685458a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -9,12 +9,17 @@ from cuda import nvrtc from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import _handle_boolean_option, check_or_create_options, handle_return +from cuda.core.experimental._utils import ( + _handle_boolean_option, + check_or_create_options, + handle_return, + is_sequence, +) @dataclass class ProgramOptions: - """Customizable :obj:`ProgramOptions` for NVRTC. + """Customizable options for configuring `Program`. Attributes ---------- @@ -147,16 +152,16 @@ class ProgramOptions: Disable the display of a diagnostic number for warning messages. Default: False Maps to: --no-display-error-number (-no-err-no) - diag_error : str, optional - Emit error for specified diagnostic message number(s). 
+ diag_error : Union[int, List[int]], optional + Emit error for a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: --diag-error=,… (-diag-error) - diag_suppress : str, optional - Suppress specified diagnostic message number(s). + Maps to: --diag-error=, ... (-diag-error) + diag_suppress : Union[int, List[int]], optional + Suppress a specified diagnostic message number or comma separated list of numbers. Default: None Maps to: --diag-suppress=,… (-diag-suppress) - diag_warn : str, optional - Emit warning for specified diagnostic message number(s). + diag_warn : Union[int, List[int]], optional + Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None Maps to: --diag-warn=,… (-diag-warn) brief_diagnostics : bool, optional @@ -182,14 +187,12 @@ class ProgramOptions: """ arch: Optional[str] = None - device_c: Optional[bool] = None - device_w: Optional[bool] = None relocatable_device_code: Optional[bool] = None extensible_whole_program: Optional[bool] = None debug: Optional[bool] = None lineinfo: Optional[bool] = None device_code_optimize: Optional[bool] = None - ptxas_options: Optional[Union[str, List[str]]] = None + ptxas_options: Optional[Union[str, List[str], Tuple[str]]] = None max_register_count: Optional[int] = None ftz: Optional[bool] = None prec_sqrt: Optional[bool] = None @@ -199,10 +202,12 @@ class ProgramOptions: extra_device_vectorization: Optional[bool] = None link_time_optimization: Optional[bool] = None gen_opt_lto: Optional[bool] = None - define_macro: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None - undefine_macro: Optional[Union[str, List[str]]] = None - include_path: Optional[Union[str, List[str]]] = None - pre_include: Optional[Union[str, List[str]]] = None + define_macro: Optional[ + Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]], Tuple[Union[str, Tuple[str, str]]]] + ] = None + undefine_macro: 
Optional[Union[str, List[str], Tuple[str]]] = None + include_path: Optional[Union[str, List[str], Tuple[str]]] = None + pre_include: Optional[Union[str, List[str], Tuple[str]]] = None no_source_include: Optional[bool] = None std: Optional[str] = None builtin_move_forward: Optional[bool] = None @@ -213,9 +218,9 @@ class ProgramOptions: device_int128: Optional[bool] = None optimization_info: Optional[str] = None no_display_error_number: Optional[bool] = None - diag_error: Optional[str] = None - diag_suppress: Optional[str] = None - diag_warn: Optional[str] = None + diag_error: Optional[Union[int, List[int], Tuple[int]]] = None + diag_suppress: Optional[Union[int, List[int], Tuple[int]]] = None + diag_warn: Optional[Union[int, List[int], Tuple[int]]] = None brief_diagnostics: Optional[bool] = None time: Optional[str] = None split_compile: Optional[int] = None @@ -244,11 +249,11 @@ def __post_init__(self): self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not None: self._formatted_options.append("--ptxas-options") - if isinstance(self.ptxas_options, list): + if is_sequence(self.ptxas_options): for option in self.ptxas_options: self._formatted_options.append(option) else: - self._formatted_options.append("self.ptxas_options") + self._formatted_options.append(self.ptxas_options) if self.max_register_count is not None: self._formatted_options.append(f"--maxrregcount={self.max_register_count}") if self.ftz is not None: @@ -268,7 +273,7 @@ def __post_init__(self): if self.gen_opt_lto is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") if self.define_macro is not None: - if isinstance(self.define_macro, list): + if is_sequence(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 @@ -282,19 +287,19 @@ def __post_init__(self): self._formatted_options.append(f"--define-macro={self.define_macro}") if self.undefine_macro is not None: - 
if isinstance(self.undefine_macro, list): + if is_sequence(self.undefine_macro): for macro in self.undefine_macro: self._formatted_options.append(f"--undefine-macro={macro}") else: self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") if self.include_path is not None: - if isinstance(self.include_path, list): + if is_sequence(self.include_path): for path in self.include_path: self._formatted_options.append(f"--include-path={path}") else: self._formatted_options.append(f"--include-path={self.include_path}") if self.pre_include is not None: - if isinstance(self.pre_include, list): + if is_sequence(self.pre_include): for header in self.pre_include: self._formatted_options.append(f"--pre-include={header}") else: @@ -324,11 +329,23 @@ def __post_init__(self): if self.no_display_error_number is not None and self.no_display_error_number: self._formatted_options.append("--no-display-error-number") if self.diag_error is not None: - self._formatted_options.append(f"--diag-error={self.diag_error}") + if is_sequence(self.diag_error): + for error in self.diag_error: + self._formatted_options.append(f"--diag-error={error}") + else: + self._formatted_options.append(f"--diag-error={self.diag_error}") if self.diag_suppress is not None: - self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + if is_sequence(self.diag_suppress): + for suppress in self.diag_suppress: + self._formatted_options.append(f"--diag-suppress={suppress}") + else: + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") if self.diag_warn is not None: - self._formatted_options.append(f"--diag-warn={self.diag_warn}") + if is_sequence(self.diag_warn): + for warn in self.diag_warn: + self._formatted_options.append(f"--diag-warn={warn}") + else: + self._formatted_options.append(f"--diag-warn={self.diag_warn}") if self.brief_diagnostics is not None: self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") if 
self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index f3d7306b..8cb6abaf 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import functools -from collections import namedtuple +from collections import Sequence, namedtuple from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -141,3 +141,17 @@ def get_device_from_ctx(ctx_handle) -> int: assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(prev_ctx)) return device_id + + +def is_sequence(obj): + """ + Check if the given object is a sequence (list or tuple). + """ + return isinstance(obj, Sequence) + + +def is_nested_sequence(obj): + """ + Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). + """ + return is_sequence(obj) and any(is_sequence(elem) for elem in obj) From cc259606e3a74ad52ae16049b483f18e55457006 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:29:56 -0800 Subject: [PATCH 35/44] change from sequence to list or tuple --- cuda_core/cuda/core/experimental/_program.py | 21 ++++++++++---------- cuda_core/cuda/core/experimental/_utils.py | 10 +++++----- cuda_core/tests/test_program.py | 3 ++- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index a685458a..bd5c18aa 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -13,7 +13,8 @@ _handle_boolean_option, check_or_create_options, handle_return, - is_sequence, + is_list_or_tuple, + is_nested_list_or_tuple, ) @@ -249,7 +250,7 @@ def __post_init__(self): self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not 
None: self._formatted_options.append("--ptxas-options") - if is_sequence(self.ptxas_options): + if is_list_or_tuple(self.ptxas_options): for option in self.ptxas_options: self._formatted_options.append(option) else: @@ -273,7 +274,7 @@ def __post_init__(self): if self.gen_opt_lto is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") if self.define_macro is not None: - if is_sequence(self.define_macro): + if is_nested_list_or_tuple(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 @@ -282,24 +283,24 @@ def __post_init__(self): self._formatted_options.append(f"--define-macro={macro}") elif isinstance(self.define_macro, tuple): assert len(self.define_macro) == 2 - self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + self._formatted_options.append("--define-macro=MY_MACRO=999") else: self._formatted_options.append(f"--define-macro={self.define_macro}") if self.undefine_macro is not None: - if is_sequence(self.undefine_macro): + if is_list_or_tuple(self.undefine_macro): for macro in self.undefine_macro: self._formatted_options.append(f"--undefine-macro={macro}") else: self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") if self.include_path is not None: - if is_sequence(self.include_path): + if is_list_or_tuple(self.include_path): for path in self.include_path: self._formatted_options.append(f"--include-path={path}") else: self._formatted_options.append(f"--include-path={self.include_path}") if self.pre_include is not None: - if is_sequence(self.pre_include): + if is_list_or_tuple(self.pre_include): for header in self.pre_include: self._formatted_options.append(f"--pre-include={header}") else: @@ -329,19 +330,19 @@ def __post_init__(self): if self.no_display_error_number is not None and self.no_display_error_number: self._formatted_options.append("--no-display-error-number") if self.diag_error is not None: - if 
is_sequence(self.diag_error): + if is_list_or_tuple(self.diag_error): for error in self.diag_error: self._formatted_options.append(f"--diag-error={error}") else: self._formatted_options.append(f"--diag-error={self.diag_error}") if self.diag_suppress is not None: - if is_sequence(self.diag_suppress): + if is_list_or_tuple(self.diag_suppress): for suppress in self.diag_suppress: self._formatted_options.append(f"--diag-suppress={suppress}") else: self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") if self.diag_warn is not None: - if is_sequence(self.diag_warn): + if is_list_or_tuple(self.diag_warn): for warn in self.diag_warn: self._formatted_options.append(f"--diag-warn={warn}") else: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 8cb6abaf..9e64311b 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import functools -from collections import Sequence, namedtuple +from collections import namedtuple from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -143,15 +143,15 @@ def get_device_from_ctx(ctx_handle) -> int: return device_id -def is_sequence(obj): +def is_list_or_tuple(obj): """ Check if the given object is a sequence (list or tuple). """ - return isinstance(obj, Sequence) + return isinstance(obj, (list, tuple)) -def is_nested_sequence(obj): +def is_nested_list_or_tuple(obj): """ Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). 
""" - return is_sequence(obj) and any(is_sequence(elem) for elem in obj) + return is_list_or_tuple(obj) and any(is_list_or_tuple(elem) for elem in obj) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index bbfc4079..0fbd5866 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -35,7 +35,8 @@ def test_program_with_various_options(init_cuda): ProgramOptions(restrict=True, device_as_default_execution_space=True), ProgramOptions(device_int128=True, optimization_info="inline"), ProgramOptions(no_display_error_number=True), - ProgramOptions(diag_error="1234", diag_suppress="1234"), + ProgramOptions(diag_error=1234, diag_suppress=1234), + ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), ProgramOptions(diag_warn="1000"), ] From e4786b2d34aa0dc0353728936f12be7fb068fe19 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Wed, 8 Jan 2025 11:34:14 -0800 Subject: [PATCH 36/44] Update cuda_core/cuda/core/experimental/_program.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_program.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index bd5c18aa..6dd807b1 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -25,12 +25,9 @@ class ProgramOptions: Attributes ---------- arch : str, optional - Specify the name of the class of GPU architectures for which the input must be compiled. - Valid values: compute_50, compute_52, compute_53, compute_60, compute_61, compute_62, compute_70, compute_72, - compute_75, compute_80, compute_87, compute_89, compute_90, compute_90a, sm_50, sm_52, sm_53, sm_60, sm_61, - sm_62, sm_70, sm_72, sm_75, sm_80, sm_87, sm_89, sm_90, sm_90a. 
- Default: compute_52 - Maps to: --gpu-architecture= (-arch) + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. relocatable_device_code : bool, optional Enable (disable) the generation of relocatable device code. Default: False From d913e0a3dba3413538dc450a5938bf9a3f0507ba Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:37:25 -0800 Subject: [PATCH 37/44] fix quotes --- cuda_core/cuda/core/experimental/_program.py | 78 ++++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 6dd807b1..7646df9b 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -20,9 +20,7 @@ @dataclass class ProgramOptions: - """Customizable options for configuring `Program`. - - Attributes + """Attributes ---------- arch : str, optional Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or @@ -31,157 +29,157 @@ class ProgramOptions: relocatable_device_code : bool, optional Enable (disable) the generation of relocatable device code. Default: False - Maps to: --relocatable-device-code={true|false} (-rdc) + Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``) extensible_whole_program : bool, optional Do extensible whole program compilation of device code. Default: False - Maps to: --extensible-whole-program (-ewp) + Maps to: ``--extensible-whole-program`` (``-ewp``) debug : bool, optional Generate debug information. If --dopt is not specified, then turns off all optimizations. Default: False - Maps to: --device-debug (-G) + Maps to: ``--device-debug`` (``-G``) lineinfo: bool, optional Generate line-number information. 
Default: False - Maps to: --generate-line-info (-lineinfo) + Maps to: ``--generate-line-info`` (``-lineinfo``) device_code_optimize : bool, optional Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation for optimized device code. Default: None - Maps to: --dopt on (-dopt) + Maps to: ``--dopt on`` (``-dopt``) ptxas_options : Union[str, List[str]], optional Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. For example ["-v", "-O2"]. Default: None - Maps to: --ptxas-options (-Xptxas) + Maps to: ``--ptxas-options `` (``-Xptxas``) max_register_count : int, optional Specify the maximum amount of registers that GPU functions can use. Default: None - Maps to: --maxrregcount= (-maxrregcount) + Maps to: ``--maxrregcount=`` (``-maxrregcount``) ftz : bool, optional When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal values. Default: False - Maps to: --ftz={true|false} (-ftz) + Maps to: ``--ftz={true|false}`` (``-ftz``) prec_sqrt : bool, optional For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: --prec-sqrt={true|false} (-prec-sqrt) + Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``) prec_div : bool, optional For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: --prec-div={true|false} (-prec-div) + Maps to: ``--prec-div={true|false}`` (``-prec-div``) fma : bool, optional Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations. Default: True - Maps to: --fmad={true|false} (-fmad) + Maps to: ``--fmad={true|false}`` (``-fmad``) use_fast_math : bool, optional Make use of fast math operations. 
Default: False - Maps to: --use_fast_math (-use_fast_math) + Maps to: ``--use_fast_math`` (``-use_fast_math``) extra_device_vectorization : bool, optional Enables more aggressive device code vectorization in the NVVM optimizer. Default: False - Maps to: --extra-device-vectorization (-extra-device-vectorization) + Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``) link_time_optimization : bool, optional Generate intermediate code for later link-time optimization. Default: False - Maps to: --dlink-time-opt (-dlto) + Maps to: ``--dlink-time-opt`` (``-dlto``) gen_opt_lto : bool, optional Run the optimizer passes before generating the LTO IR. Default: False - Maps to: --gen-opt-lto (-gen-opt-lto) + Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``) define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of strings, in which case the first element is defined as the second, or a list of strings or tuples. Default: None - Maps to: --define-macro= (-D) + Maps to: ``--define-macro=`` (``-D``) undefine_macro : Union[str, List[str]], optional Cancel any previous definition of a macro, or list of macros. Default: None - Maps to: --undefine-macro= (-U) + Maps to: ``--undefine-macro=`` (``-U``) include_path : Union[str, List[str]], optional Add the directory or directories to the list of directories to be searched for headers. Default: None - Maps to: --include-path= (-I) + Maps to: ``--include-path=`` (``-I``) pre_include : Union[str, List[str]], optional Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. Default: None - Maps to: --pre-include=
(-include) + Maps to: ``--pre-include=
`` (``-include``) no_source_include : bool, optional Disable the default behavior of adding the directory of each input source to the include path. Default: False - Maps to: --no-source-include (-no-source-include) + Maps to: ``--no-source-include`` (``-no-source-include``) std : str, optional Set language dialect to C++03, C++11, C++14, C++17 or C++20. Default: c++17 - Maps to: --std={c++03|c++11|c++14|c++17|c++20} (-std) + Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``) builtin_move_forward : bool, optional Provide builtin definitions of std::move and std::forward. Default: True - Maps to: --builtin-move-forward={true|false} (-builtin-move-forward) + Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``) builtin_initializer_list : bool, optional Provide builtin definitions of std::initializer_list class and member functions. Default: True - Maps to: --builtin-initializer-list={true|false} (-builtin-initializer-list) + Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``) disable_warnings : bool, optional Inhibit all warning messages. Default: False - Maps to: --disable-warnings (-w) + Maps to: ``--disable-warnings`` (``-w``) restrict : bool, optional Programmer assertion that all kernel pointer parameters are restrict pointers. Default: False - Maps to: --restrict (-restrict) + Maps to: ``--restrict`` (``-restrict``) device_as_default_execution_space : bool, optional Treat entities with no execution space annotation as __device__ entities. Default: False - Maps to: --device-as-default-execution-space (-default-device) + Maps to: ``--device-as-default-execution-space`` (``-default-device``) device_int128 : bool, optional Allow the __int128 type in device code. Default: False - Maps to: --device-int128 (-device-int128) + Maps to: ``--device-int128`` (``-device-int128``) optimization_info : str, optional Provide optimization reports for the specified kind of optimization. 
Default: None - Maps to: --optimization-info= (-opt-info) + Maps to: ``--optimization-info=`` (``-opt-info``) no_display_error_number : bool, optional Disable the display of a diagnostic number for warning messages. Default: False - Maps to: --no-display-error-number (-no-err-no) + Maps to: ``--no-display-error-number`` (``-no-err-no``) diag_error : Union[int, List[int]], optional Emit error for a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: --diag-error=, ... (-diag-error) + Maps to: ``--diag-error=, ...`` (``-diag-error``) diag_suppress : Union[int, List[int]], optional Suppress a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: --diag-suppress=,… (-diag-suppress) + Maps to: ``--diag-suppress=,…`` (``-diag-suppress``) diag_warn : Union[int, List[int]], optional Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None - Maps to: --diag-warn=,… (-diag-warn) + Maps to: ``--diag-warn=,…`` (``-diag-warn``) brief_diagnostics : bool, optional Disable or enable showing source line and column info in a diagnostic. Default: False - Maps to: --brief-diagnostics={true|false} (-brief-diag) + Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``) time : str, optional Generate a CSV table with the time taken by each compilation phase. Default: None - Maps to: --time= (-time) + Maps to: ``--time=`` (``-time``) split_compile : int, optional Perform compiler optimizations in parallel. Default: 1 - Maps to: --split-compile= (-split-compile) + Maps to: ``--split-compile= `` (``-split-compile``) fdevice_syntax_only : bool, optional Ends device compilation after front-end syntax checking. Default: False - Maps to: --fdevice-syntax-only (-fdevice-syntax-only) + Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``) minimal : bool, optional Omit certain language features to reduce compile time for small programs. 
Default: False - Maps to: --minimal (-minimal) + Maps to: ``--minimal`` (``-minimal``) """ arch: Optional[str] = None From 06c68b45ca7beb178c51e50199b3121a4904a9d8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:38:02 -0800 Subject: [PATCH 38/44] fix quotes --- cuda_core/cuda/core/experimental/_program.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 7646df9b..789ef539 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -20,7 +20,9 @@ @dataclass class ProgramOptions: - """Attributes + """Customizable options for configuring `Program`. + + Attributes ---------- arch : str, optional Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or From 3a747f68bd1b49d3ea36986ed4bd561400243d9d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 8 Jan 2025 11:40:28 -0800 Subject: [PATCH 39/44] swap api order --- cuda_core/docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 191760ae..5239174f 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -35,8 +35,8 @@ CUDA compilation toolchain :template: dataclass.rst - LinkerOptions ProgramOptions + LinkerOptions CUDA system information From 9ca2f409d673d300574b99ccf8f3c9e568f0bbe3 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 13 Jan 2025 14:26:03 -0800 Subject: [PATCH 40/44] switch the order fo comparisons to use sequence instead of list / tuple --- cuda_core/cuda/core/experimental/_program.py | 53 ++++++++++---------- cuda_core/cuda/core/experimental/_utils.py | 9 ++-- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 789ef539..85c46df5 100644 --- 
a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -13,8 +13,8 @@ _handle_boolean_option, check_or_create_options, handle_return, - is_list_or_tuple, - is_nested_list_or_tuple, + is_nested_sequence, + is_sequence, ) @@ -247,11 +247,11 @@ def __post_init__(self): self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") if self.ptxas_options is not None: self._formatted_options.append("--ptxas-options") - if is_list_or_tuple(self.ptxas_options): + if isinstance(self.ptxas_options, str): + self._formatted_options.append(self.ptxas_options) + elif is_sequence(self.ptxas_options): for option in self.ptxas_options: self._formatted_options.append(option) - else: - self._formatted_options.append(self.ptxas_options) if self.max_register_count is not None: self._formatted_options.append(f"--maxrregcount={self.max_register_count}") if self.ftz is not None: @@ -271,7 +271,9 @@ def __post_init__(self): if self.gen_opt_lto is not None and self.gen_opt_lto: self._formatted_options.append("--gen-opt-lto") if self.define_macro is not None: - if is_nested_list_or_tuple(self.define_macro): + if isinstance(self.define_macro, str): + self._formatted_options.append(f"--define-macro={self.define_macro}") + if is_nested_sequence(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 @@ -281,27 +283,26 @@ def __post_init__(self): elif isinstance(self.define_macro, tuple): assert len(self.define_macro) == 2 self._formatted_options.append("--define-macro=MY_MACRO=999") - else: - self._formatted_options.append(f"--define-macro={self.define_macro}") if self.undefine_macro is not None: - if is_list_or_tuple(self.undefine_macro): + if isinstance(self.undefine_macro, str): + self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") + elif is_sequence(self.undefine_macro): for macro in self.undefine_macro: 
self._formatted_options.append(f"--undefine-macro={macro}") - else: - self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") if self.include_path is not None: - if is_list_or_tuple(self.include_path): + if isinstance(self.include_path, str): + self._formatted_options.append(f"--include-path={self.include_path}") + elif is_sequence(self.include_path): for path in self.include_path: self._formatted_options.append(f"--include-path={path}") - else: - self._formatted_options.append(f"--include-path={self.include_path}") if self.pre_include is not None: - if is_list_or_tuple(self.pre_include): + if isinstance(self.pre_include, str): + self._formatted_options.append(f"--pre-include={self.pre_include}") + elif is_sequence(self.pre_include): for header in self.pre_include: self._formatted_options.append(f"--pre-include={header}") - else: - self._formatted_options.append(f"--pre-include={self.pre_include}") + if self.no_source_include is not None and self.no_source_include: self._formatted_options.append("--no-source-include") if self.std is not None: @@ -327,23 +328,23 @@ def __post_init__(self): if self.no_display_error_number is not None and self.no_display_error_number: self._formatted_options.append("--no-display-error-number") if self.diag_error is not None: - if is_list_or_tuple(self.diag_error): + if isinstance(self.diag_error, int): + self._formatted_options.append(f"--diag-error={self.diag_error}") + elif is_sequence(self.diag_error): for error in self.diag_error: self._formatted_options.append(f"--diag-error={error}") - else: - self._formatted_options.append(f"--diag-error={self.diag_error}") if self.diag_suppress is not None: - if is_list_or_tuple(self.diag_suppress): + if isinstance(self.diag_suppress, int): + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + elif is_sequence(self.diag_suppress): for suppress in self.diag_suppress: self._formatted_options.append(f"--diag-suppress={suppress}") - else: - 
self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") if self.diag_warn is not None: - if is_list_or_tuple(self.diag_warn): + if isinstance(self.diag_warn, int): + self._formatted_options.append(f"--diag-warn={self.diag_warn}") + elif is_sequence(self.diag_warn): for warn in self.diag_warn: self._formatted_options.append(f"--diag-warn={warn}") - else: - self._formatted_options.append(f"--diag-warn={self.diag_warn}") if self.brief_diagnostics is not None: self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") if self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 9e64311b..f0bac9d4 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -4,6 +4,7 @@ import functools from collections import namedtuple +from collections.abc import Sequence from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -143,15 +144,15 @@ def get_device_from_ctx(ctx_handle) -> int: return device_id -def is_list_or_tuple(obj): +def is_sequence(obj): """ Check if the given object is a sequence (list or tuple). """ - return isinstance(obj, (list, tuple)) + return isinstance(obj, Sequence) -def is_nested_list_or_tuple(obj): +def is_nested_sequence(obj): """ Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). 
""" - return is_list_or_tuple(obj) and any(is_list_or_tuple(elem) for elem in obj) + return is_sequence(obj) and any(is_sequence(elem) for elem in obj) From a710b454e6d4fbee583b1592df2d57f6df915b2b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 13 Jan 2025 14:30:56 -0800 Subject: [PATCH 41/44] fix order and debug code --- cuda_core/cuda/core/experimental/_program.py | 8 ++++---- cuda_core/tests/test_program.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 85c46df5..2b8889c4 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -273,16 +273,16 @@ def __post_init__(self): if self.define_macro is not None: if isinstance(self.define_macro, str): self._formatted_options.append(f"--define-macro={self.define_macro}") - if is_nested_sequence(self.define_macro): + elif isinstance(self.define_macro, tuple): + assert len(self.define_macro) == 2 + self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + elif is_nested_sequence(self.define_macro): for macro in self.define_macro: if isinstance(macro, tuple): assert len(macro) == 2 self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") else: self._formatted_options.append(f"--define-macro={macro}") - elif isinstance(self.define_macro, tuple): - assert len(self.define_macro) == 2 - self._formatted_options.append("--define-macro=MY_MACRO=999") if self.undefine_macro is not None: if isinstance(self.undefine_macro, str): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 0fbd5866..0f9b8e3b 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -37,7 +37,7 @@ def test_program_with_various_options(init_cuda): ProgramOptions(no_display_error_number=True), ProgramOptions(diag_error=1234, diag_suppress=1234), 
ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), - ProgramOptions(diag_warn="1000"), + ProgramOptions(diag_warn=1000), ] for options in options_list: From f4b0f0a96b2d5940477f07979a62e43c4ad2afa0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 15 Jan 2025 15:04:36 -0800 Subject: [PATCH 42/44] fix ruff --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/cuda/core/experimental/_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 66232928..3db9e8ab 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -6,8 +6,8 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch -from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions from cuda.core.experimental._system import System diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index b0254e31..70d7aae4 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -157,8 +157,8 @@ def is_nested_sequence(obj): Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). 
""" return is_sequence(obj) and any(is_sequence(elem) for elem in obj) - - + + def get_binding_version(): try: major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2] From 459cbdbca4f9144fdd8feaebd43dd7f49db361d2 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 17 Jan 2025 12:38:46 -0800 Subject: [PATCH 43/44] lint the merge --- cuda_core/cuda/core/experimental/_linker.py | 1 - cuda_core/cuda/core/experimental/_program.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 6ad8220c..b5a6b675 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -9,7 +9,6 @@ from dataclasses import dataclass from typing import List, Optional - from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import check_or_create_options, driver, handle_return diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index aa7d9ead..4f094118 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -6,7 +6,6 @@ from dataclasses import dataclass from typing import List, Optional, Tuple, Union - from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import ( @@ -15,7 +14,7 @@ handle_return, is_nested_sequence, is_sequence, - nvrtc + nvrtc, ) From 9ef482157aa09359429a15b3d06f1b7a585ea915 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 19 Jan 2025 04:45:19 +0000 Subject: [PATCH 44/44] nits --- cuda_core/cuda/core/experimental/_program.py | 6 +++--- cuda_core/examples/vector_add.py | 3 ++- cuda_core/tests/test_linker.py | 12 ++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py 
b/cuda_core/cuda/core/experimental/_program.py index 4f094118..c10fd077 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE @@ -403,6 +403,8 @@ def close(self): def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff = Program._MembersNeededForFinalize(self, None) + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") + if code_type not in self._supported_code_type: raise NotImplementedError @@ -416,8 +418,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None): else: raise NotImplementedError - self._options = options = check_or_create_options(ProgramOptions, options, "Program options") - def close(self): """Destroy this program.""" self._mnff.close() diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 5a42ab7b..ec398209 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -26,7 +26,8 @@ s = dev.create_stream() # prepare program -program_options = ProgramOptions(std="c++17", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") prog = Program(code, code_type="c++", options=program_options) mod = prog.compile("cubin", name_expressions=("vector_add",)) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 37424d7b..b81c1654 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -86,14 +86,10 @@ def test_linker_init(compile_ptx_functions, options): def test_linker_init_invalid_arch(compile_ptx_functions): - if culink_backend: - with 
pytest.raises(AttributeError): - options = LinkerOptions(arch="99", ptx=True) - Linker(*compile_ptx_functions, options=options) - else: - with pytest.raises(nvjitlink.nvJitLinkError): - options = LinkerOptions(arch="99", ptx=True) - Linker(*compile_ptx_functions, options=options) + err = AttributeError if culink_backend else nvjitlink.nvJitLinkError + with pytest.raises(err): + options = LinkerOptions(arch="99", ptx=True) + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option")