[CIR][CUDA] Generate CUDA destructor (#1470)

AdUhTkJm · web-flow · commit fa5b07c53076 · 2025-03-11T18:49:39.000-07:00
This is Part 3 of registration function generation.

This generates `__cuda_module_dtor`. It cannot be placed in the global
dtors list, because treating it as a normal destructor results in a
double-free in recent CUDA versions (see comments in OG). Instead, the
function is registered as an `atexit` callback, and the call to `atexit`
is emitted at the end of `__cuda_module_ctor`.
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -127,7 +127,7 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   llvm::StringMap<FuncOp> cudaKernelMap;
 
   void buildCUDAModuleCtor();
-  void buildCUDAModuleDtor();
+  std::optional<FuncOp> buildCUDAModuleDtor();
   std::optional<FuncOp> buildCUDARegisterGlobals();
 
   void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
@@ -1153,6 +1153,23 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
     builder.createCallOp(loc, endFunc, gpuBinaryHandle);
   }
 
+  // Create destructor and register it with atexit() the way NVCC does it. Doing
+  // it during regular destructor phase worked in CUDA before 9.2 but results in
+  // double-free in 9.2.
+  if (auto dtor = buildCUDAModuleDtor()) {
+    // extern "C" int atexit(void (*f)(void));
+    cir::CIRBaseBuilderTy globalBuilder(getContext());
+    globalBuilder.setInsertionPointToStart(theModule.getBody());
+    FuncOp atexit = buildRuntimeFunction(
+        globalBuilder, "atexit", loc,
+        FuncType::get(PointerType::get(dtor->getFunctionType()), intTy));
+
+    mlir::Value dtorFunc = builder.create<GetGlobalOp>(
+        loc, PointerType::get(dtor->getFunctionType()),
+        mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr()));
+    builder.createCallOp(loc, atexit, dtorFunc);
+  }
+
   builder.create<cir::ReturnOp>(loc);
 }
 
@@ -1256,6 +1273,51 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
   }
 }
 
+std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() {
+  if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
+    return {}; // Module has no CUDA binary handle attribute; emit nothing.
+
+  std::string prefix = getCUDAPrefix(astCtx);
+
+  auto voidTy = VoidType::get(&getContext());
+  auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy));
+
+  auto loc = theModule.getLoc();
+
+  cir::CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(theModule.getBody());
+
+  // Runtime function: void __cudaUnregisterFatBinary(void **handle);
+  std::string unregisterFuncName =
+      addUnderscoredPrefix(prefix, "UnregisterFatBinary");
+  FuncOp unregisterFunc = buildRuntimeFunction(
+      builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy));
+
+  // void __cuda_module_dtor();
+  // Despite its name, OG does not treat this as a real destructor, so it must
+  // not be added to globalDtorList: as a real dtor it would cause a double
+  // free above CUDA 9.2. Instead, the module ctor is expected to register it
+  // with atexit() at its end.
+  std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor");
+  FuncOp dtor =
+      buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy),
+                           GlobalLinkageKind::InternalLinkage);
+
+  builder.setInsertionPointToStart(dtor.addEntryBlock());
+
+  // The dtor body only needs to call:
+  //    __cudaUnregisterFatBinary(__cuda_gpubin_handle);
+  // NOTE(review): assumes the handle global already exists here; confirm.
+  std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle");
+  auto gpubinGlobal = cast<GlobalOp>(theModule.lookupSymbol(gpubinName));
+  mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal);
+  mlir::Value gpubin = builder.createLoad(loc, gpubinAddress);
+  builder.createCallOp(loc, unregisterFunc, gpubin);
+  builder.create<ReturnOp>(loc);
+
+  return dtor;
+}
+
 void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointAfter(op);
@@ -1537,9 +1599,6 @@ void LoweringPreparePass::runOnOperation() {
     datalayout.emplace(theModule);
   }
 
-  auto typeSizeInfo = cast<TypeSizeInfoAttr>(
-      theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
-
   llvm::SmallVector<Operation *> opsToTransform;
 
   op->walk([&](Operation *op) {
diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu
@@ -18,6 +18,15 @@
 // CIR-HOST:   cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>]
 // CIR-HOST: }
 
+// Module destructor goes here.
+// This is not a real destructor, as explained in LoweringPrepare.
+
+// CIR-HOST: cir.func internal private @__cuda_module_dtor() {
+// CIR-HOST:   %[[#HandleGlobal:]] = cir.get_global @__cuda_gpubin_handle
+// CIR-HOST:   %[[#Handle:]] = cir.load %[[#HandleGlobal]]
+// CIR-HOST:   cir.call @__cudaUnregisterFatBinary(%[[#Handle]])
+// CIR-HOST: }
+
 // CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" =
 // CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros>
 
@@ -33,6 +42,12 @@
 // LLVM-HOST: }
 // LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
 
+// LLVM-HOST: define internal void @__cuda_module_dtor() {
+// LLVM-HOST:   %[[#LLVMHandleVar:]] = load ptr, ptr @__cuda_gpubin_handle, align 8
+// LLVM-HOST:   call void @__cudaUnregisterFatBinary(ptr %[[#LLVMHandleVar]])
+// LLVM-HOST:   ret void
+// LLVM-HOST: }
+
 __global__ void fn() {}
 
 // CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) {
@@ -83,12 +98,15 @@ __global__ void fn() {}
 // CIR-HOST:   %[[#FatbinGlobal:]] = cir.get_global @__cuda_gpubin_handle
 // CIR-HOST:   cir.store %[[#Fatbin]], %[[#FatbinGlobal]]
 // CIR-HOST:   cir.call @__cuda_register_globals
-// CIR-HOTS:   cir.call @__cudaRegisterFatBinaryEnd
+// CIR-HOST:   cir.call @__cudaRegisterFatBinaryEnd
+// CIR-HOST:   %[[#ModuleDtor:]] = cir.get_global @__cuda_module_dtor
+// CIR-HOST:   cir.call @atexit(%[[#ModuleDtor]])
 // CIR-HOST: }
 
 // LLVM-HOST: define internal void @__cuda_module_ctor() {
-// LLVM-HOST:  %[[#LLVMFatbin:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
-// LLVM-HOST:  store ptr %[[#LLVMFatbin]], ptr @__cuda_gpubin_handle
-// LLVM-HOST:  call void @__cuda_register_globals
-// LLVM-HOST:  call void @__cudaRegisterFatBinaryEnd
+// LLVM-HOST:   %[[#LLVMFatbin:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// LLVM-HOST:   store ptr %[[#LLVMFatbin]], ptr @__cuda_gpubin_handle
+// LLVM-HOST:   call void @__cuda_register_globals
+// LLVM-HOST:   call void @__cudaRegisterFatBinaryEnd
+// LLVM-HOST:   call i32 @atexit(ptr @__cuda_module_dtor)
 // LLVM-HOST: }