diff --git a/.gitignore b/.gitignore index 0f7ad5be..1c73b74f 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ cuda_bindings/cuda/bindings/_bindings/cydriver.pxd cuda_bindings/cuda/bindings/_bindings/cydriver.pyx cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx +cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pxd cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pyx cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pxd diff --git a/cuda_bindings/MANIFEST.in b/cuda_bindings/MANIFEST.in index ef9def92..88bab329 100644 --- a/cuda_bindings/MANIFEST.in +++ b/cuda_bindings/MANIFEST.in @@ -1,4 +1,4 @@ recursive-include cuda/ *.pyx *.pxd # at least with setuptools 75.0.0 this folder was added erroneously # to the payload, causing file copying to the build environment failed -exclude cuda/bindings +exclude cuda/bindings cuda?bindings diff --git a/cuda_bindings/cuda/ccuda.pyx b/cuda_bindings/cuda/ccuda.pyx index 73f3fc5c..a07525bb 100644 --- a/cuda_bindings/cuda/ccuda.pyx +++ b/cuda_bindings/cuda/ccuda.pyx @@ -5,3 +5,8 @@ cdef extern from *: #pragma message ( "The cuda.ccuda module is deprecated and will be removed in a future release, " \ "please switch to use the cuda.bindings.cydriver module instead." ) """ + + +from cuda.bindings import cydriver +__pyx_capi__ = cydriver.__pyx_capi__ +del cydriver diff --git a/cuda_bindings/cuda/ccudart.pyx b/cuda_bindings/cuda/ccudart.pyx index b32eece8..3cef0811 100644 --- a/cuda_bindings/cuda/ccudart.pyx +++ b/cuda_bindings/cuda/ccudart.pyx @@ -5,3 +5,8 @@ cdef extern from *: #pragma message ( "The cuda.ccudart module is deprecated and will be removed in a future release, " \ "please switch to use the cuda.bindings.cyruntime module instead." 
) """ + + +from cuda.bindings import cyruntime +__pyx_capi__ = cyruntime.__pyx_capi__ +del cyruntime diff --git a/cuda_bindings/cuda/cnvrtc.pyx b/cuda_bindings/cuda/cnvrtc.pyx index d4034084..405d7de0 100644 --- a/cuda_bindings/cuda/cnvrtc.pyx +++ b/cuda_bindings/cuda/cnvrtc.pyx @@ -5,3 +5,8 @@ cdef extern from *: #pragma message ( "The cuda.cnvrtc module is deprecated and will be removed in a future release, " \ "please switch to use the cuda.bindings.cynvrtc module instead." ) """ + + +from cuda.bindings import cynvrtc +__pyx_capi__ = cynvrtc.__pyx_capi__ +del cynvrtc diff --git a/cuda_bindings/docs/source/release.md b/cuda_bindings/docs/source/release.md index c3ae5a30..69f361a2 100644 --- a/cuda_bindings/docs/source/release.md +++ b/cuda_bindings/docs/source/release.md @@ -5,6 +5,7 @@ maxdepth: 3 --- + 12.6.2 12.6.1 12.6.0 12.5.0 @@ -14,6 +15,7 @@ maxdepth: 3 12.2.0 12.1.0 12.0.0 + 11.8.5 11.8.4 11.8.3 11.8.2 diff --git a/cuda_bindings/docs/source/release/11.4.0-notes.md b/cuda_bindings/docs/source/release/11.4.0-notes.md index f7611688..9eaa4eff 100644 --- a/cuda_bindings/docs/source/release/11.4.0-notes.md +++ b/cuda_bindings/docs/source/release/11.4.0-notes.md @@ -2,7 +2,7 @@ Released on August 16, 2021 -## Hightlights +## Highlights - Initial EA release for CUDA Python - Supports all platforms that CUDA is supported - Supports all CUDA 11.x releases diff --git a/cuda_bindings/docs/source/release/11.5.0-notes.md b/cuda_bindings/docs/source/release/11.5.0-notes.md index a7f8fddc..130cb17d 100644 --- a/cuda_bindings/docs/source/release/11.5.0-notes.md +++ b/cuda_bindings/docs/source/release/11.5.0-notes.md @@ -2,7 +2,7 @@ Released on October 18, 2021 -## Hightlights +## Highlights - PyPi support - Conda support - GA release for CUDA Python diff --git a/cuda_bindings/docs/source/release/11.6.0-notes.md b/cuda_bindings/docs/source/release/11.6.0-notes.md index 60a9d920..664da162 100644 --- a/cuda_bindings/docs/source/release/11.6.0-notes.md +++ 
b/cuda_bindings/docs/source/release/11.6.0-notes.md @@ -2,7 +2,7 @@ Released on Januray 12, 2022 -## Hightlights +## Highlights - Support CUDA Toolkit 11.6 - Support Profiler APIs - Support Graphic APIs (EGL, GL, VDPAU) diff --git a/cuda_bindings/docs/source/release/11.6.1-notes.md b/cuda_bindings/docs/source/release/11.6.1-notes.md index bc2ba329..ddd6ff51 100644 --- a/cuda_bindings/docs/source/release/11.6.1-notes.md +++ b/cuda_bindings/docs/source/release/11.6.1-notes.md @@ -2,7 +2,7 @@ Released on March 18, 2022 -## Hightlights +## Highlights - Fix string decomposition for WSL library load ## Limitations diff --git a/cuda_bindings/docs/source/release/11.7.0-notes.md b/cuda_bindings/docs/source/release/11.7.0-notes.md index 91ab215e..22500c7a 100644 --- a/cuda_bindings/docs/source/release/11.7.0-notes.md +++ b/cuda_bindings/docs/source/release/11.7.0-notes.md @@ -2,7 +2,7 @@ Released on May 11, 2022 -## Hightlights +## Highlights - Support CUDA Toolkit 11.7 ## Limitations diff --git a/cuda_bindings/docs/source/release/11.7.1-notes.md b/cuda_bindings/docs/source/release/11.7.1-notes.md index 8d07b19d..2997c9da 100644 --- a/cuda_bindings/docs/source/release/11.7.1-notes.md +++ b/cuda_bindings/docs/source/release/11.7.1-notes.md @@ -2,7 +2,7 @@ Released on June 29, 2022 -## Hightlights +## Highlights - Fix error propagation in CUDA Runtime bindings - Resolves [issue #22](https://github.com/NVIDIA/cuda-python/issues/22) diff --git a/cuda_bindings/docs/source/release/11.8.0-notes.md b/cuda_bindings/docs/source/release/11.8.0-notes.md index f860e5fb..c5bf9f71 100644 --- a/cuda_bindings/docs/source/release/11.8.0-notes.md +++ b/cuda_bindings/docs/source/release/11.8.0-notes.md @@ -2,7 +2,7 @@ Released on October 3, 2022 -## Hightlights +## Highlights - Support CUDA Toolkit 11.8 - Source builds allow for missing types and APIs - Resolves source builds for mobile platforms diff --git a/cuda_bindings/docs/source/release/11.8.1-notes.md 
b/cuda_bindings/docs/source/release/11.8.1-notes.md index 94565355..f7c2e7d4 100644 --- a/cuda_bindings/docs/source/release/11.8.1-notes.md +++ b/cuda_bindings/docs/source/release/11.8.1-notes.md @@ -2,7 +2,7 @@ Released on November 4, 2022 -## Hightlights +## Highlights - Resolves [issue #27](https://github.com/NVIDIA/cuda-python/issues/27) - Update install instructions to use latest CTK diff --git a/cuda_bindings/docs/source/release/11.8.2-notes.md b/cuda_bindings/docs/source/release/11.8.2-notes.md index 84d781b5..f9d16556 100644 --- a/cuda_bindings/docs/source/release/11.8.2-notes.md +++ b/cuda_bindings/docs/source/release/11.8.2-notes.md @@ -2,7 +2,7 @@ Released on May 18, 2023 -## Hightlights +## Highlights - Open libcuda.so.1 instead of libcuda.so ## Limitations diff --git a/cuda_bindings/docs/source/release/11.8.3-notes.md b/cuda_bindings/docs/source/release/11.8.3-notes.md index 91bbc491..a8ff840c 100644 --- a/cuda_bindings/docs/source/release/11.8.3-notes.md +++ b/cuda_bindings/docs/source/release/11.8.3-notes.md @@ -2,7 +2,7 @@ Released on October 23, 2023 -## Hightlights +## Highlights - Compatability with Cython 3 - New API cudart.getLocalRuntimeVersion() - Modernize build config diff --git a/cuda_bindings/docs/source/release/11.8.4-notes.md b/cuda_bindings/docs/source/release/11.8.4-notes.md index 9cae2915..13767998 100644 --- a/cuda_bindings/docs/source/release/11.8.4-notes.md +++ b/cuda_bindings/docs/source/release/11.8.4-notes.md @@ -2,12 +2,34 @@ Released on October 7, 2024 -## Hightlights +## Highlights - Resolve [Issue #89](https://github.com/NVIDIA/cuda-python/issues/89): Fix getLocalRuntimeVersion searching for wrong libcudart version - Resolve [Issue #90](https://github.com/NVIDIA/cuda-python/issues/90): Use new layout in preperation for cuda-python becoming a metapackage +## CUDA namespace cleanup with a new module layout + +[Issue #75](https://github.com/NVIDIA/cuda-python/issues/75) explains in detail what the new module layout is, what 
problem it fixes and how it impacts the users. However for the sake of completeness, these release notes will highlight key points of this change. + +Before this change, `cuda-python` was tightly coupled to CUDA Toolkit releases and all new features would inherit this coupling regardless of their applicability. As we develop new features, this coupling was becoming overly restrictive and motivated a new solution: Convert `cuda-python` into a metapackage where we use `cuda` as a namespace with existing bindings code moved to a `cuda_bindings` subpackage. + +This patch release applies the new module layout for the bindings as follows: +- `cuda.cuda` -> `cuda.bindings.driver` +- `cuda.ccuda` -> `cuda.bindings.cydriver` +- `cuda.cudart` -> `cuda.bindings.runtime` +- `cuda.ccudart` -> `cuda.bindings.cyruntime` +- `cuda.nvrtc` -> `cuda.bindings.nvrtc` +- `cuda.cnvrtc` -> `cuda.bindings.cynvrtc` + +Deprecation warnings are turned on as a notice to switch to the new module layout. + +```{note} This is a non-breaking, backwards compatible change. All old module paths will continue to work as they "forward" user calls towards the new layout. 
+``` + ## Limitations +### Known issues +- [Issue #215](https://github.com/NVIDIA/cuda-python/issues/215) + ### CUDA Functions Not Supported in this Release - Symbol APIs diff --git a/cuda_bindings/docs/source/release/11.8.5-notes.md b/cuda_bindings/docs/source/release/11.8.5-notes.md new file mode 100644 index 00000000..44616459 --- /dev/null +++ b/cuda_bindings/docs/source/release/11.8.5-notes.md @@ -0,0 +1,31 @@ +# CUDA Python 11.8.5 Release notes + +Released on November 5, 2024 + +## Highlights +- Resolve [Issue #215](https://github.com/NVIDIA/cuda-python/issues/215): module 'cuda.ccudart' has no attribute '__pyx_capi__' + +## Limitations + +### CUDA Functions Not Supported in this Release + +- Symbol APIs + - cudaGraphExecMemcpyNodeSetParamsFromSymbol + - cudaGraphExecMemcpyNodeSetParamsToSymbol + - cudaGraphAddMemcpyNodeToSymbol + - cudaGraphAddMemcpyNodeFromSymbol + - cudaGraphMemcpyNodeSetParamsToSymbol + - cudaGraphMemcpyNodeSetParamsFromSymbol + - cudaMemcpyToSymbol + - cudaMemcpyFromSymbol + - cudaMemcpyToSymbolAsync + - cudaMemcpyFromSymbolAsync + - cudaGetSymbolAddress + - cudaGetSymbolSize + - cudaGetFuncBySymbol +- Launch Options + - cudaLaunchKernel + - cudaLaunchCooperativeKernel + - cudaLaunchCooperativeKernelMultiDevice +- cudaSetValidDevices +- cudaVDPAUSetVDPAUDevice diff --git a/cuda_bindings/docs/source/release/12.0.0-notes.md b/cuda_bindings/docs/source/release/12.0.0-notes.md index df1bf1f9..9f2ae258 100644 --- a/cuda_bindings/docs/source/release/12.0.0-notes.md +++ b/cuda_bindings/docs/source/release/12.0.0-notes.md @@ -2,7 +2,7 @@ Released on December 8, 2022 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.0 - Fix example from [MR28](https://github.com/NVIDIA/cuda-python/pull/28) - Apply [MR35](https://github.com/NVIDIA/cuda-python/pull/35) diff --git a/cuda_bindings/docs/source/release/12.1.0-notes.md b/cuda_bindings/docs/source/release/12.1.0-notes.md index aec56999..94310bb5 100644 --- 
a/cuda_bindings/docs/source/release/12.1.0-notes.md +++ b/cuda_bindings/docs/source/release/12.1.0-notes.md @@ -2,7 +2,7 @@ Released on February 28, 2023 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.1 - Resolve [Issue #41](https://github.com/NVIDIA/cuda-python/issues/41): Add support for Python 3.11 - Resolve [Issue #42](https://github.com/NVIDIA/cuda-python/issues/42): Dropping Python 3.7 diff --git a/cuda_bindings/docs/source/release/12.2.0-notes.md b/cuda_bindings/docs/source/release/12.2.0-notes.md index d6bd6675..39e37b9a 100644 --- a/cuda_bindings/docs/source/release/12.2.0-notes.md +++ b/cuda_bindings/docs/source/release/12.2.0-notes.md @@ -2,7 +2,7 @@ Released on June 28, 2023 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.2 - Resolve [Issue #44](https://github.com/NVIDIA/cuda-python/issues/44): nogil must be at the end of the function signature line - Resolve [Issue #45](https://github.com/NVIDIA/cuda-python/issues/45): Error with pyparsing when no CUDA is found diff --git a/cuda_bindings/docs/source/release/12.2.1-notes.md b/cuda_bindings/docs/source/release/12.2.1-notes.md index 41704a56..3a89af85 100644 --- a/cuda_bindings/docs/source/release/12.2.1-notes.md +++ b/cuda_bindings/docs/source/release/12.2.1-notes.md @@ -2,7 +2,7 @@ Released on January 8, 2024 -## Hightlights +## Highlights - Compatibility with Cython 3 ## Limitations diff --git a/cuda_bindings/docs/source/release/12.3.0-notes.md b/cuda_bindings/docs/source/release/12.3.0-notes.md index 016ee0de..15bcdb97 100644 --- a/cuda_bindings/docs/source/release/12.3.0-notes.md +++ b/cuda_bindings/docs/source/release/12.3.0-notes.md @@ -2,7 +2,7 @@ Released on October 19, 2023 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.3 - Resolve [Issue #16](https://github.com/NVIDIA/cuda-python/issues/16): cuda.cudart.cudaRuntimeGetVersion() hard-codes the runtime version, rather than querying the runtime - New API cudart.getLocalRuntimeVersion() diff --git 
a/cuda_bindings/docs/source/release/12.4.0-notes.md b/cuda_bindings/docs/source/release/12.4.0-notes.md index 6daedb20..191ecc64 100644 --- a/cuda_bindings/docs/source/release/12.4.0-notes.md +++ b/cuda_bindings/docs/source/release/12.4.0-notes.md @@ -2,7 +2,7 @@ Released on March 5, 2024 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.4 - Add PyPI/Conda support for Python 12 diff --git a/cuda_bindings/docs/source/release/12.5.0-notes.md b/cuda_bindings/docs/source/release/12.5.0-notes.md index 701f0320..b0e527a8 100644 --- a/cuda_bindings/docs/source/release/12.5.0-notes.md +++ b/cuda_bindings/docs/source/release/12.5.0-notes.md @@ -2,7 +2,7 @@ Released on May 21, 2024 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.5 - Resolve [Issue #58](https://github.com/NVIDIA/cuda-python/issues/58): Interop between CUdeviceptr and Runtime diff --git a/cuda_bindings/docs/source/release/12.6.0-notes.md b/cuda_bindings/docs/source/release/12.6.0-notes.md index 2531e89b..466e2eec 100644 --- a/cuda_bindings/docs/source/release/12.6.0-notes.md +++ b/cuda_bindings/docs/source/release/12.6.0-notes.md @@ -2,7 +2,7 @@ Released on August 1, 2024 -## Hightlights +## Highlights - Rebase to CUDA Toolkit 12.6 - Resolve [Issue #32](https://github.com/NVIDIA/cuda-python/issues/32): Add 'pywin32' as Windows requirement - Resolve [Issue #72](https://github.com/NVIDIA/cuda-python/issues/72): Allow both lists and tuples as parameter diff --git a/cuda_bindings/docs/source/release/12.6.1-notes.md b/cuda_bindings/docs/source/release/12.6.1-notes.md index bf196213..36004712 100644 --- a/cuda_bindings/docs/source/release/12.6.1-notes.md +++ b/cuda_bindings/docs/source/release/12.6.1-notes.md @@ -2,7 +2,7 @@ Released on October 7, 2024 -## Hightlights +## Highlights - Resolve [Issue #90](https://github.com/NVIDIA/cuda-python/issues/90): Use new layout in preparation for cuda-python becoming a metapackage - Resolve [Issue #75](https://github.com/NVIDIA/cuda-python/issues/75): CUDA 
namespace cleanup @@ -27,6 +27,9 @@ Deprecation warnings are turned on as a notice to switch to the new module layou ## Limitations +### Known issues +- [Issue #215](https://github.com/NVIDIA/cuda-python/issues/215) + ### CUDA Functions Not Supported in this Release - Symbol APIs diff --git a/cuda_bindings/docs/source/release/12.6.2-notes.md b/cuda_bindings/docs/source/release/12.6.2-notes.md new file mode 100644 index 00000000..06fe110b --- /dev/null +++ b/cuda_bindings/docs/source/release/12.6.2-notes.md @@ -0,0 +1,33 @@ +# CUDA Python 12.6.2 Release notes + +Released on November 5, 2024 + +## Highlights +- Resolve [Issue #215](https://github.com/NVIDIA/cuda-python/issues/215): module 'cuda.ccudart' has no attribute '__pyx_capi__' + +## Limitations + +### CUDA Functions Not Supported in this Release + +- Symbol APIs + - cudaGraphExecMemcpyNodeSetParamsFromSymbol + - cudaGraphExecMemcpyNodeSetParamsToSymbol + - cudaGraphAddMemcpyNodeToSymbol + - cudaGraphAddMemcpyNodeFromSymbol + - cudaGraphMemcpyNodeSetParamsToSymbol + - cudaGraphMemcpyNodeSetParamsFromSymbol + - cudaMemcpyToSymbol + - cudaMemcpyFromSymbol + - cudaMemcpyToSymbolAsync + - cudaMemcpyFromSymbolAsync + - cudaGetSymbolAddress + - cudaGetSymbolSize + - cudaGetFuncBySymbol +- Launch Options + - cudaLaunchKernel + - cudaLaunchCooperativeKernel + - cudaLaunchCooperativeKernelMultiDevice +- cudaSetValidDevices +- cudaVDPAUSetVDPAUDevice +- cudaFuncGetName +- cudaFuncGetParamInfo diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index f4c9c5bc..63c09db5 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -36,18 +36,8 @@ dependencies = [ Repository = "https://github.com/NVIDIA/cuda-python" Documentation = "https://nvidia.github.io/cuda-python/" -# BETA -# [tool.setuptools] -# zip-safe = false - -# BETA -# [tool.setuptools.packages.find] -# where = ["cuda"] -# include = ["cuda", "cuda.*"] - -# BETA -# [tool.setuptools.package-data] -# "*" = ["*.pxd", "*.pyx", 
"*.h", "*.cpp"] +[tool.setuptools.packages.find] +include = ["cuda*"] [tool.versioneer] VCS = "git" diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py index ca1f8264..2342e62a 100644 --- a/cuda_bindings/setup.py +++ b/cuda_bindings/setup.py @@ -269,8 +269,6 @@ def do_cythonize(extensions): ["cuda/bindings/*.pyx"], # public (deprecated, to be removed) ["cuda/*.pyx"], - # tests - ["tests/*.pyx"], # interal files used by generated bindings ['cuda/bindings/_internal/nvjitlink.pyx'], ['cuda/bindings/_internal/utils.pyx'], @@ -304,11 +302,6 @@ def finalize_options(self): setup( version=versioneer.get_version(), ext_modules=do_cythonize(extensions), - packages=find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]), - package_data=dict.fromkeys( - find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]), - ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"], - ), cmdclass=cmdclass, zip_safe=False, ) diff --git a/cuda_bindings/tests/test_ccuda.pyx b/cuda_bindings/tests_cython/test_ccuda.pyx similarity index 100% rename from cuda_bindings/tests/test_ccuda.pyx rename to cuda_bindings/tests_cython/test_ccuda.pyx diff --git a/cuda_bindings/tests/test_ccudart.pyx b/cuda_bindings/tests_cython/test_ccudart.pyx similarity index 100% rename from cuda_bindings/tests/test_ccudart.pyx rename to cuda_bindings/tests_cython/test_ccudart.pyx diff --git a/cuda_bindings/tests/test_cython.py b/cuda_bindings/tests_cython/test_cython.py similarity index 100% rename from cuda_bindings/tests/test_cython.py rename to cuda_bindings/tests_cython/test_cython.py diff --git a/cuda_bindings/tests/test_interoperability_cython.pyx b/cuda_bindings/tests_cython/test_interoperability_cython.pyx 
similarity index 100% rename from cuda_bindings/tests/test_interoperability_cython.pyx rename to cuda_bindings/tests_cython/test_interoperability_cython.pyx diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 65d5fe9b..2899282c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -19,7 +19,31 @@ class Device: + """Represent a GPU and act as an entry point for cuda.core features. + This is a singleton object that helps ensure interoperability + across multiple libraries imported in the process to both see + and use the same GPU device. + + While acting as the entry point, many other CUDA resources can be + allocated such as streams and buffers. Any :obj:`Context` dependent + resource created through this device, will continue to refer to + this device's context. + + Newly returned :obj:`Device` objects are thread-local singletons + for a specified device. + + Note + ---- + Will not initialize the GPU. + + Parameters + ---------- + device_id : int, optional + Device ordinal to return a :obj:`Device` object for. + Default value of `None` returns the currently used device. + + """ __slots__ = ("_id", "_mr", "_has_inited") def __new__(cls, device_id=None): @@ -54,15 +78,29 @@ def _check_context_initialized(self, *args, **kwargs): @property def device_id(self) -> int: + """Return device ordinal.""" return self._id @property def pci_bus_id(self) -> str: + """Return a PCI Bus Id string for this device.""" bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property def uuid(self) -> str: + """Return a UUID for the device. + + Returns 16-octets identifying the device. If the device is in + MIG mode, returns its MIG UUID which uniquely identifies the + subscribed MIG compute instance. + + Note + ---- + MIG UUID is only returned when device is in MIG mode and the + driver is CUDA 11.4 or newer. 
+ + """ driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) @@ -74,19 +112,21 @@ def uuid(self) -> str: @property def name(self) -> str: - # assuming a GPU name is less than 128 characters... - name = handle_return(cuda.cuDeviceGetName(128, self._id)) + """Return the device name.""" + # Use 256 characters to be consistent with CUDA Runtime + name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] return name.decode() @property def properties(self) -> dict: + """Return information about the compute-device.""" # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property def compute_capability(self) -> ComputeCapability: - """Returns a named tuple with 2 fields: major and minor. """ + """Return a named tuple with 2 fields: major and minor.""" major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( @@ -96,12 +136,20 @@ def compute_capability(self) -> ComputeCapability: @property @precondition(_check_context_initialized) def context(self) -> Context: + """Return the current :obj:`Context` associated with this device. + + Note + ---- + Device must be initialized. + + """ ctx = handle_return(cuda.cuCtxGetCurrent()) assert int(ctx) != 0 return Context._from_ctx(ctx, self._id) @property def memory_resource(self) -> MemoryResource: + """Return :obj:`MemoryResource` associated with this device.""" return self._mr @memory_resource.setter @@ -112,27 +160,53 @@ def memory_resource(self, mr): @property def default_stream(self) -> Stream: + """Return default CUDA :obj:`Stream` associated with this device. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. 
Otherwise returns + the legacy stream. + + """ return default_stream() def __int__(self): + """Return device_id.""" return self._id def __repr__(self): return f"" def set_current(self, ctx: Context=None) -> Union[Context, None]: - """ - Entry point of this object. Users always start a code by + """Set device to be used for GPU executions. + + Initializes CUDA and sets the calling thread to a valid CUDA + context. By default the primary context is used, but optional `ctx` + parameter can be used to explicitly supply a :obj:`Context` object. + + Providing a `ctx` causes the previous set context to be popped and returned. + + Parameters + ---------- + ctx : :obj:`Context`, optional + Optional context to push onto this device's current thread stack. + + Returns + ------- + Union[:obj:`Context`, None], optional + Popped context. + + Examples + -------- + Acts as an entry point of this object. Users always start a code by calling this method, e.g. - + >>> from cuda.core.experimental import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... - - The optional ctx argument is for advanced users to bind a - CUDA context with the device. In this case, the previously - set context is popped and returned to the user. + """ if ctx is not None: if not isinstance(ctx, Context): @@ -163,25 +237,94 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: - # Create a Context object (but do NOT set it current yet!). - # ContextOptions is a dataclass for setting e.g. affinity or CIG - # options. + """Create a new :obj:`Context` object. + + Note + ---- + The newly context will not be set as current. + + Parameters + ---------- + options : :obj:`ContextOptions`, optional + Customizable dataclass for context creation options. + + Returns + ------- + :obj:`Context` + Newly created context object. 
+ + """ raise NotImplementedError("TODO") @precondition(_check_context_initialized) def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: - # Create a Stream object by either holding a newly created - # CUDA stream or wrapping an existing foreign object supporting - # the __cuda_stream__ protocol. In the latter case, a reference - # to obj is held internally so that its lifetime is managed. + """Create a Stream object. + + New stream objects can be created in two different ways: + + 1) Create a new CUDA stream with customizable `options`. + 2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol. + + Option (2) internally holds a reference to the foreign object + such that the lifetime is managed. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + obj : Any, optional + Any object supporting the __cuda_stream__ protocol. + options : :obj:`StreamOptions`, optional + Customizable dataclass for stream creation options. + + Returns + ------- + :obj:`Stream` + Newly created stream object. + + """ return Stream._init(obj=obj, options=options) @precondition(_check_context_initialized) def allocate(self, size, stream=None) -> Buffer: + """Allocate device memory from a specified stream. + + Allocates device memory of `size` bytes on the specified `stream` + using the memory resource currently associated with this Device. + + Parameter `stream` is optional, using a default stream by default. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + size : int + Number of bytes to allocate. + stream : :obj:`Stream`, optional + The stream establishing the stream ordering semantic. + Default value of `None` uses default stream. + + Returns + ------- + :obj:`Buffer` + Newly created buffer object. + + """ if stream is None: stream = default_stream() return self._mr.allocate(size, stream) @precondition(_check_context_initialized) def sync(self): + """Synchronize the device. 
+ + Note + ---- + Device must be initialized. + + """ handle_return(cudart.cudaDeviceSynchronize()) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 3c85d9fe..a6d5da28 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -13,17 +13,46 @@ @dataclass class EventOptions: + """Customizable :obj:`Event` options. + + Attributes + ---------- + enable_timing : bool, optional + Event will record timing data. (Default to False) + busy_waited_sync : bool, optional + If True, event will use blocking synchronization. When a CPU + thread calls synchronize, the call will block until the event + has actually been completed. + Otherwise, the CPU thread will busy-wait until the event has + been completed. (Default to False) + support_ipc : bool, optional + Event will be suitable for interprocess use. + Note that enable_timing must be False. (Default to False) + + """ enable_timing: Optional[bool] = False busy_waited_sync: Optional[bool] = False support_ipc: Optional[bool] = False class Event: + """Represent a record at a specific point of execution within a CUDA stream. + Applications can asynchronously record events at any point in + the program. An event keeps a record of all previous work within + the last recorded stream. + + Events can be used to monitor device's progress, query completion + of work up to event's record, and help establish dependencies + between GPU work submissions. + + Directly creating an :obj:`Event` is not supported due to ambiguity, + and they should instead be created through a :obj:`Stream` object. + + """ __slots__ = ("_handle", "_timing_disabled", "_busy_waited") def __init__(self): - # minimal requirements for the destructor self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. 
Please call " @@ -51,37 +80,45 @@ def _init(options: Optional[EventOptions]=None): return self def __del__(self): + """Return close(self)""" self.close() def close(self): - # Destroy the event. + """Destroy the event.""" if self._handle: handle_return(cuda.cuEventDestroy(self._handle)) self._handle = None @property def is_timing_disabled(self) -> bool: - # Check if this instance can be used for the timing purpose. + """Return True if the event does not record timing data, otherwise False.""" return self._timing_disabled @property def is_sync_busy_waited(self) -> bool: - # Check if the event synchronization would keep the CPU busy-waiting. + """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" return self._busy_waited @property def is_ipc_supported(self) -> bool: - # Check if this instance can be used for IPC. + """Return True if this event can be used as an interprocess event, otherwise False.""" raise NotImplementedError("TODO") def sync(self): - # Sync over the event. + """Synchronize until the event completes. + + If the event was created with busy_waited_sync, then the + calling CPU thread will block until the event has been + completed by the device. + Otherwise the CPU thread will busy-wait until the event + has been completed. + + """ handle_return(cuda.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: - # Return True if all captured works have been completed, - # otherwise False. 
+ """Return True if all captured works have been completed, otherwise False.""" result, = cuda.cuEventQuery(self._handle) if result == cuda.CUresult.CUDA_SUCCESS: return True @@ -92,4 +129,5 @@ def is_done(self) -> bool: @property def handle(self) -> int: + """Return the underlying cudaEvent_t pointer address as Python int.""" return int(self._handle) diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 4b9533cb..9991638f 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -37,7 +37,23 @@ def _lazy_init(): @dataclass class LaunchConfig: - """ + """Customizable launch options. + + Attributes + ---------- + grid : Union[tuple, int] + Collection of threads that will execute a kernel function. + block : Union[tuple, int] + Group of threads (Thread Block) that will execute on the same + multiprocessor. Threads within a thread blocks have access to + shared memory and can be explicitly synchronized. + stream : :obj:`Stream` + The stream establishing the stream ordering semantic of a + launch. + shmem_size : int, optional + Dynamic shared-memory size per thread block in bytes. + (Default to size 0) + """ # TODO: expand LaunchConfig to include other attributes grid: Union[tuple, int] = None @@ -87,6 +103,21 @@ def _cast_to_3_tuple(self, cfg): def launch(kernel, config, *kernel_args): + """Launches a :obj:`~cuda.core.experimental._module.Kernel` + object with launch-time configuration. + + Parameters + ---------- + kernel : :obj:`~cuda.core.experimental._module.Kernel` + Kernel to launch. + config : :obj:`LaunchConfig` + Launch configurations inline with options provided by + :obj:`LaunchConfig` dataclass. + *kernel_args : Any + Variable length argument list that is provided to the + launching kernel. 
+ + """ if not isinstance(kernel, Kernel): raise ValueError config = check_or_create_options(LaunchConfig, config, "launch config") diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 4ef2cbc3..678f26ee 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -22,6 +22,26 @@ class Buffer: + """Represent a handle to allocated memory. + + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by + establishing both the DLPack and the Python-level buffer + protocols. + + Parameters + ---------- + ptr : Any + Allocated buffer handle object + size : Any + Memory size of the buffer + mr : :obj:`MemoryResource`, optional + Memory resource associated with the buffer + + """ # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) @@ -32,9 +52,23 @@ def __init__(self, ptr, size, mr: MemoryResource=None): self._mr = mr def __del__(self): - self.close(default_stream()) + """Return close(self).""" + self.close() def close(self, stream=None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to their memory resource + asynchronously on the given stream. + + Parameters + ---------- + stream : Any, optional + The stream object with a __cuda_stream__ protocol to + use for asynchronous deallocation. Defaults to using + the default stream. 
+ + """ if self._ptr and self._mr is not None: if stream is None: stream = default_stream() @@ -44,42 +78,56 @@ def close(self, stream=None): @property def handle(self): + """Return the buffer handle object.""" return self._ptr @property def size(self): + """Return the memory size of this buffer.""" return self._size @property def memory_resource(self) -> MemoryResource: - # Return the memory resource from which this buffer was allocated. + """Return the memory resource associated with this buffer.""" return self._mr @property def is_device_accessible(self) -> bool: - # Check if this buffer can be accessed from GPUs. + """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._mr is not None: return self._mr.is_device_accessible raise NotImplementedError @property def is_host_accessible(self) -> bool: - # Check if this buffer can be accessed from CPUs. + """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._mr is not None: return self._mr.is_host_accessible raise NotImplementedError @property def device_id(self) -> int: + """Return the device ordinal of this buffer.""" if self._mr is not None: return self._mr.device_id raise NotImplementedError def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: - # Copy from this buffer to the dst buffer asynchronously on the - # given stream. The dst buffer is returned. If the dst is not provided, - # allocate one from self.memory_resource. Raise an exception if the - # stream is not provided. + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. 
+ + Parameters + ---------- + dst : :obj:`Buffer` + Source buffer to copy data from + stream : Any + Keyword argument specifying the stream for the + asynchronous copy + + """ if stream is None: raise ValueError("stream must be provided") if dst is None: @@ -93,8 +141,17 @@ def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: return dst def copy_from(self, src: Buffer, *, stream): - # Copy from the src buffer to this buffer asynchronously on the - # given stream. Raise an exception if the stream is not provided. + """Copy from the src buffer to this buffer asynchronously on the given stream. + + Parameters + ---------- + src : :obj:`Buffer` + Source buffer to copy data from + stream : Any + Keyword argument specifying the stream for the + asynchronous copy + + """ if stream is None: raise ValueError("stream must be provided") if src._size != self._size: @@ -141,7 +198,7 @@ def __buffer__(self, flags: int, /) -> memoryview: raise NotImplementedError("TODO") def __release_buffer__(self, buffer: memoryview, /): - # Supporting methond paired with __buffer__. + # Supporting method paired with __buffer__. raise NotImplementedError("TODO") diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 60d4db97..8b0ff9a7 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -46,6 +46,15 @@ def _lazy_init(): class Kernel: + """Represent a compiled kernel that had been loaded onto the device. + + Kernel instances can execution when passed directly into the + :func:`~cuda.core.experimental.launch` function. + + Directly creating a :obj:`Kernel` is not supported, and they + should instead be created through a :obj:`ObjectCode` object. + + """ __slots__ = ("_handle", "_module",) @@ -65,6 +74,35 @@ def _from_obj(obj, mod): class ObjectCode: + """Represent a compiled program that was loaded onto the device. 
+ + This object provides a unified interface for different types of + compiled programs that are loaded onto the device. + + Loads the module library with specified module code and JIT options. + + Note + ---- + Usage under CUDA 11.x will only load to the current device + context. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the module to load, or + a file path string containing that module for loading. + code_type : Any + String of the compiled type. + Supported options are "ptx", "cubin" and "fatbin". + jit_options : Optional + Mapping of JIT options to use during module loading. + (Default to no options) + symbol_mapping : Optional + Keyword argument dictionary specifying how symbol names + should be mapped before trying to retrieve them. + (Default to no mappings) + + """ __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "fatbin") @@ -107,6 +145,19 @@ def __del__(self): pass def get_kernel(self, name): + """Return the :obj:`Kernel` of a specified name from this object code. + + Parameters + ---------- + name : Any + Name of the kernel to retrieve. + + Returns + ------- + :obj:`Kernel` + Newly created kernel object. + + """ try: name = self._sym_map[name] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index ec0778a3..5439c74a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,6 +8,21 @@ class Program: + """Represent a compilation machinery to process programs into + :obj:`~cuda.core.experimental._module.ObjectCode`. + + This object provides a unified interface to multiple underlying + compiler libraries. Compilation support is enabled for a wide + range of code types and compilation types. + + Parameters + ---------- + code : Any + String of the CUDA Runtime Compilation program. 
+ code_type : Any + String of the code type. Currently only ``"c++"`` is supported. + + """ __slots__ = ("_handle", "_backend", ) _supported_code_type = ("c++", ) @@ -30,14 +45,40 @@ def __init__(self, code, code_type): raise NotImplementedError def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroy this program.""" if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None def compile(self, target_type, options=(), name_expressions=(), logs=None): + """Compile the program with a specific compilation type. + + Parameters + ---------- + target_type : Any + String of the targeted compilation type. + Supported options are "ptx", "cubin" and "ltoir". + options : Union[List, Tuple], optional + List of compilation options associated with the backend + of this :obj:`Program`. (Default to no options) + name_expressions : Union[List, Tuple], optional + List of explicit name expressions to become accessible. + (Default to no expressions) + logs : Any, optional + Object with a write method to receive the logs generated + from compilation. + (Default to no logs) + + Returns + ------- + :obj:`~cuda.core.experimental._module.ObjectCode` + Newly created code object. + + """ if target_type not in self._supported_target_type: raise NotImplementedError @@ -80,8 +121,10 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): @property def backend(self): + """Return the backend type string associated with this program.""" return self._backend @property def handle(self): + """Return the program handle object.""" return self._handle diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 95f8ec50..7f50dafd 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -20,12 +20,38 @@ @dataclass class StreamOptions: + """Customizable :obj:`Stream` options. 
+ Attributes + ---------- + nonblocking : bool, optional + Stream does not synchronize with the NULL stream. (Default to True) + priority : int, optional + Stream priority where lower number represents a + higher priority. (Default to lowest priority) + + """ nonblocking: bool = True priority: Optional[int] = None class Stream: + """Represent a queue of GPU operations that are executed in a specific order. + + Applications use streams to control the order of execution for + GPU work. Work within a single stream are executed sequentially. + Whereas work across multiple streams can be further controlled + using stream priorities and :obj:`Event` managements. + + Advanced users can utilize default streams for enforce complex + implicit synchronization behaviors. + + Directly creating a :obj:`Stream` is not supported due to ambiguity. + New streams should instead be created through a :obj:`Device` + object, or created directly through using an existing handle + using Stream.from_handle(). + + """ __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") @@ -74,13 +100,12 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): else: flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + high, low = handle_return(cudart.cudaDeviceGetStreamPriorityRange()) if priority is not None: - high, low = handle_return( - cudart.cudaDeviceGetStreamPriorityRange()) if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: - priority = 0 + priority = high self._handle = handle_return( cuda.cuStreamCreateWithPriority(flags, priority)) @@ -94,9 +119,16 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): return self def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroy the stream. + + Destroy the stream if we own it. Borrowed foreign stream + object will instead have their references released. 
+ + """ if self._owner is None: if self._handle and not self._builtin: handle_return(cuda.cuStreamDestroy(self._handle)) @@ -106,15 +138,17 @@ def close(self): @property def __cuda_stream__(self) -> Tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" return (0, int(self._handle)) @property def handle(self) -> int: - # Return the underlying cudaStream_t pointer address as Python int. + """Return the underlying cudaStream_t pointer address as Python int.""" return int(self._handle) @property def is_nonblocking(self) -> bool: + """Return True if this is a nonblocking stream, otherwise False.""" if self._nonblocking is None: flag = handle_return(cuda.cuStreamGetFlags(self._handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: @@ -125,15 +159,35 @@ def is_nonblocking(self) -> bool: @property def priority(self) -> int: + """Return the stream priority.""" if self._priority is None: prio = handle_return(cuda.cuStreamGetPriority(self._handle)) self._priority = prio return self._priority def sync(self): + """Synchronize the stream.""" handle_return(cuda.cuStreamSynchronize(self._handle)) def record(self, event: Event=None, options: EventOptions=None) -> Event: + """Record an event onto the stream. + + Creates an Event object (or reuses the given one) by + recording on the stream. + + Parameters + ---------- + event : :obj:`Event`, optional + Optional event object to be reused for recording. + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`Event` + Newly created event object. + + """ # Create an Event object (or reusing the given one) by recording # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. 
@@ -145,11 +199,15 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: return event def wait(self, event_or_stream: Union[Event, Stream]): - # Wait for a CUDA event or a CUDA stream to establish a stream order. - # - # If a Stream instance is provided, the effect is as if an event is - # recorded on the given stream, and then self waits on the recorded - # event. + """Wait for a CUDA event or a CUDA stream. + + Waiting for an event or a stream establishes a stream order. + + If a :obj:`Stream` is provided, then wait until the stream's + work is completed. This is done by recording a new :obj:`Event` + on the stream and then waiting on it. + + """ if isinstance(event_or_stream, Event): event = event_or_stream.handle discard_event = False @@ -175,12 +233,15 @@ def wait(self, event_or_stream: Union[Event, Stream]): @property def device(self) -> Device: - # Inverse look-up to find on which device this stream instance was - # created. - # - # Note that Stream.device.context might not necessarily agree with - # Stream.context, in cases where a different CUDA context is set - # current after a stream was created. + """Return the :obj:`Device` singleton associated with this stream. + + Note + ---- + The current context on the device may differ from this + stream's context. This case occurs when a different CUDA + context is set current after a stream is created. 
+ + """ from cuda.core.experimental._device import Device # avoid circular import if self._device_id is None: # Get the stream context first @@ -192,8 +253,7 @@ def device(self) -> Device: @property def context(self) -> Context: - # Inverse look-up to find in which CUDA context this stream instance - # was created + """Return the :obj:`Context` associated with this stream.""" if self._ctx_handle is None: self._ctx_handle = handle_return( cuda.cuStreamGetCtx(self._handle)) @@ -203,6 +263,28 @@ def context(self) -> Context: @staticmethod def from_handle(handle: int) -> Stream: + """Create a new :obj:`Stream` object from a foreign stream handle. + + Uses a cudaStream_t pointer address represented as a Python int + to create a new :obj:`Stream` object. + + Note + ---- + Stream lifetime is not managed, foreign object must remain + alive while this steam is active. + + Parameters + ---------- + handle : int + Stream handle representing the address of a foreign + stream object. + + Returns + ------- + :obj:`Stream` + Newly created stream object. + + """ class _stream_holder: @property def __cuda_stream__(self): @@ -235,6 +317,15 @@ def __init__(self): def default_stream(): + """Return the default CUDA :obj:`Stream`. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ # TODO: flip the default use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) if use_ptds: diff --git a/cuda_core/docs/source/_templates/autosummary/class.rst b/cuda_core/docs/source/_templates/autosummary/class.rst new file mode 100644 index 00000000..b45a3fd5 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,26 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + .. 
rubric:: {{ _('Methods') }} + + {% for item in methods %} + .. automethod:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + {% for item in attributes %} + .. autoattribute:: {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst new file mode 100644 index 00000000..b8c35324 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -0,0 +1,10 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + .. automethod:: __init__ + {% endblock %} + diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst new file mode 100644 index 00000000..d3ad7d24 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst @@ -0,0 +1,8 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: __new__ + :special-members: __new__ + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ \ No newline at end of file diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 756ed776..1cb9811b 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -3,6 +3,11 @@ ``cuda.core.experimental`` API Reference ======================================== +All of the APIs listed (or cross-referenced from) below are considered *experimental* +and subject to future changes without deprecation notice. Once stablized they will be +moved out of the ``experimental`` namespace. 
+ + CUDA runtime ------------ @@ -10,6 +15,14 @@ CUDA runtime :toctree: generated/ Device + launch + + :template: dataclass.rst + + EventOptions + StreamOptions + LaunchConfig + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst new file mode 100644 index 00000000..f100eb7c --- /dev/null +++ b/cuda_core/docs/source/api_private.rst @@ -0,0 +1,28 @@ +:orphan: + +.. This page is to generate documentation for private classes exposed to users, + i.e., users cannot instantiate it by themselves but may use it's properties + or methods via returned values from public APIs. These classes must be referred + in public APIs returning their instances. + +.. currentmodule:: cuda.core.experimental + +CUDA runtime +------------ + +.. autosummary:: + :toctree: generated/ + + _memory.Buffer + _stream.Stream + _event.Event + + +CUDA compilation toolchain +-------------------------- + +.. autosummary:: + :toctree: generated/ + + _module.Kernel + _module.ObjectCode diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 4be77656..5b28d331 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -34,7 +34,8 @@ 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', 'myst_nb', - 'enum_tools.autoenum' + 'enum_tools.autoenum', + 'sphinx_copybutton', ] # Add any paths that contain templates here, relative to this directory. @@ -77,3 +78,6 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] + +# skip cmdline prompts +copybutton_exclude = '.linenos, .gp' diff --git a/cuda_core/docs/source/install.md b/cuda_core/docs/source/install.md index 593f7225..e13f37df 100644 --- a/cuda_core/docs/source/install.md +++ b/cuda_core/docs/source/install.md @@ -5,25 +5,19 @@ `cuda.core` is supported on all platforms that CUDA is supported. 
Specific dependencies are as follows: -* Driver: Linux (450.80.02 or later) Windows (456.38 or later) -* CUDA Toolkit 12.0 to 12.6 +| | CUDA 11 | CUDA 12 | +|------------------ | ------------ | ----------- | +| CUDA Toolkit [^1] | 11.2 - 11.8 | 12.0 - 12.6 | +| Driver | 450.80.02+ (Linux), 452.39+ (Windows) | 525.60.13+ (Linux), 527.41+ (Windows) | - -## Installing from PyPI - -Coming soon! - - -## Installing from Conda - -Coming soon! +[^1]: Including `cuda-python`. ## Installing from Source -```shell +```console $ git clone https://github.com/NVIDIA/cuda-python $ cd cuda-python/cuda_core $ pip install . ``` -For now `cuda-python` (`cuda-bindings` later) is a required dependency. +For now `cuda-python` (`cuda-bindings` later) 11.x or 12.x is a required dependency. diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 2247e0cb..1ebb41f9 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,15 +1,17 @@ -# ``cuda.core`` Release notes +# `cuda.core` Release notes -Released on Oct XX, 2024 +Released on Nov XX, 2024 ## Hightlights -- Initial beta 1 release +- Initial EA1 (early access) release - Supports all platforms that CUDA is supported -- Supports all CUDA 12.x drivers -- Supports all CUDA 12.x Toolkits +- Supports all CUDA 11.x/12.x drivers +- Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities ## Limitations -- Source code release only; Python packages coming in a future release -- Support for CUDA 11.x coming in the next release +- All APIs are currently *experimental* and subject to change without deprecation notice. + Please kindly share your feedbacks with us so that we can make `cuda.core` better! 
+- Source code release only; `pip`/`conda` support is coming in a future release +- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) diff --git a/cuda_python/docs/environment-docs.yml b/cuda_python/docs/environment-docs.yml index 2a3a8ad3..bc9588fd 100644 --- a/cuda_python/docs/environment-docs.yml +++ b/cuda_python/docs/environment-docs.yml @@ -11,6 +11,7 @@ dependencies: - pytest - scipy - sphinx + - sphinx-copybutton - pip: - furo - myst-nb