avoid silly, redundant lock

leofang · leofang · commit 95777c478ca0 · 2025-02-21T19:36:25.000Z
diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,6 +939,12 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
@@ -948,27 +955,26 @@ def __new__(cls, device_id=None):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
-                        )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(runtime.cudaGetDeviceCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+
+                dev._has_inited = False
+                dev._properties = None
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")