From 89c403d6a9196ac97d4d3acde84fc72b1fd82089 Mon Sep 17 00:00:00 2001
From: Daniel Peter <daniel.peter22@gmail.com>
Date: Wed, 3 May 2023 13:28:06 +0200
Subject: [PATCH] adds CUDA Hopper support

---
 Makefile.in                                   |  8 ++++++-
 configure                                     | 15 ++++++++++++
 configure.ac                                  |  4 +++-
 src/gpu/kernels/Kernel_2_viscoelastic_impl.cu |  2 --
 src/gpu/mesh_constants_cuda.h                 | 23 +++++++++++++++----
 src/gpu/rules.mk                              |  3 +++
 6 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/Makefile.in b/Makefile.in
index 0ba94b640..bfd35256c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -145,6 +145,9 @@ SCOTCH_LIBDIR = @SCOTCH_LIBDIR@
 @COND_CUDA11_TRUE@CUDA11 = yes
 @COND_CUDA11_FALSE@CUDA11 = no
 
+@COND_CUDA12_TRUE@CUDA12 = yes
+@COND_CUDA12_FALSE@CUDA12 = no
+
 # CUDA compilation with linking
 @COND_CUDA_PLUS_TRUE@CUDA_PLUS = yes
 @COND_CUDA_PLUS_FALSE@CUDA_PLUS = no
@@ -173,7 +176,7 @@ CUDA_LINK = @CUDA_LDFLAGS@ @CUDA_LIBS@ -lstdc++
 # Volta   (cuda9, V100): -gencode=arch=compute_70,code=sm_70
 # Turing  (cuda10, T4, GeForce RTX 2080): -gencode=arch=compute_75,code=sm_75
 # Ampere  (cuda11, A100, GeForce RTX 3080): -gencode=arch=compute_80,code=sm_80
-
+# Hopper  (cuda12, H100): -gencode=arch=compute_90,code=sm_90
 GENCODE_20 = -gencode=arch=compute_20,code=\"sm_20,compute_20\"
 GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\"
 GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
@@ -184,8 +187,11 @@ GENCODE_60 = -gencode=arch=compute_60,code=\"sm_60,compute_60\"
 GENCODE_70 = -gencode=arch=compute_70,code=\"sm_70,compute_70\"
 GENCODE_75 = -gencode=arch=compute_75,code=\"sm_75,compute_75\"
 GENCODE_80 = -gencode=arch=compute_80,code=\"sm_80,compute_80\"
+GENCODE_90 = -gencode=arch=compute_90,code=\"sm_90,compute_90\"
 
 # cuda preprocessor flag
+# CUDA version 12.0
+@COND_CUDA_TRUE@@COND_CUDA12_TRUE@GENCODE = $(GENCODE_90) $(FC_DEFINE)GPU_DEVICE_Hopper
 # CUDA version 11.0
 @COND_CUDA_TRUE@@COND_CUDA11_TRUE@GENCODE = $(GENCODE_80) $(FC_DEFINE)GPU_DEVICE_Ampere
 # CUDA version 10.0
diff --git a/configure b/configure
index a25b98111..1ff46dd30 100755
--- a/configure
+++ b/configure
@@ -715,6 +715,8 @@ COND_OMP_FALSE
 COND_OMP_TRUE
 COND_CUDA_PLUS_FALSE
 COND_CUDA_PLUS_TRUE
+COND_CUDA12_FALSE
+COND_CUDA12_TRUE
 COND_CUDA11_FALSE
 COND_CUDA11_TRUE
 COND_CUDA10_FALSE
@@ -3326,6 +3328,14 @@ else
   COND_CUDA11_FALSE=
 fi
 
+ if test x"$want_cuda" = xcuda12; then
+  COND_CUDA12_TRUE=
+  COND_CUDA12_FALSE='#'
+else
+  COND_CUDA12_TRUE='#'
+  COND_CUDA12_FALSE=
+fi
+
 
 # cuda linking for cuda 5x and 6x and 7x and 8x and ..
  if test "$want_cuda" = cuda4 \
@@ -3336,6 +3346,7 @@ fi
      -o "$want_cuda" = cuda9 \
      -o "$want_cuda" = cuda10 \
      -o "$want_cuda" = cuda11 \
+     -o "$want_cuda" = cuda12 \
 ; then
   COND_CUDA_PLUS_TRUE=
   COND_CUDA_PLUS_FALSE='#'
@@ -8956,6 +8967,10 @@ if test -z "${COND_CUDA11_TRUE}" && test -z "${COND_CUDA11_FALSE}"; then
   as_fn_error $? "conditional \"COND_CUDA11\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${COND_CUDA12_TRUE}" && test -z "${COND_CUDA12_FALSE}"; then
+  as_fn_error $? "conditional \"COND_CUDA12\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${COND_CUDA_PLUS_TRUE}" && test -z "${COND_CUDA_PLUS_FALSE}"; then
   as_fn_error $? "conditional \"COND_CUDA_PLUS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
diff --git a/configure.ac b/configure.ac
index aea34d26d..b3c618520 100644
--- a/configure.ac
+++ b/configure.ac
@@ -90,6 +90,7 @@ AM_CONDITIONAL([COND_CUDA8], [test x"$want_cuda" = xcuda8])
 AM_CONDITIONAL([COND_CUDA9], [test x"$want_cuda" = xcuda9])
 AM_CONDITIONAL([COND_CUDA10], [test x"$want_cuda" = xcuda10])
 AM_CONDITIONAL([COND_CUDA11], [test x"$want_cuda" = xcuda11])
+AM_CONDITIONAL([COND_CUDA12], [test x"$want_cuda" = xcuda12])
 
 # cuda linking for cuda 5x and 6x and 7x and 8x and ..
 AM_CONDITIONAL([COND_CUDA_PLUS],
@@ -100,7 +101,8 @@ AM_CONDITIONAL([COND_CUDA_PLUS],
      -o "$want_cuda" = cuda8 \
      -o "$want_cuda" = cuda9 \
      -o "$want_cuda" = cuda10 \
-     -o "$want_cuda" = cuda11 \]
+     -o "$want_cuda" = cuda11 \
+     -o "$want_cuda" = cuda12 \]
 )
 
 ###
diff --git a/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu b/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
index b179d2759..4daa9771d 100644
--- a/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
+++ b/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
@@ -35,7 +35,6 @@
 
 
 #ifdef USE_TEXTURES_FIELDS
-
 realw_texture d_displ_tex;
 realw_texture d_accel_tex;
 // backward/reconstructed
@@ -59,7 +58,6 @@ template<> __device__ float texfetch_accel<1>(int x) { return tex1Dfetch(d_accel
 // FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
 template<> __device__ float texfetch_displ<3>(int x) { return tex1Dfetch(d_b_displ_tex, x); }
 template<> __device__ float texfetch_accel<3>(int x) { return tex1Dfetch(d_b_accel_tex, x); }
-
 #endif
 
 #ifdef USE_TEXTURES_CONSTANTS
diff --git a/src/gpu/mesh_constants_cuda.h b/src/gpu/mesh_constants_cuda.h
index f2521cadb..0ea00a46c 100644
--- a/src/gpu/mesh_constants_cuda.h
+++ b/src/gpu/mesh_constants_cuda.h
@@ -153,11 +153,11 @@
 //#define USE_TEXTURES_CONSTANTS  // might not working properly yet, please test on your card...
 
 #ifdef USE_CUDA
-// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
-#if CUDA_VERSION < 4000
-#undef USE_TEXTURES_FIELDS
-#undef USE_TEXTURES_CONSTANTS
-#endif
+  // CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
+  #if CUDA_VERSION < 4000 || (defined (__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ < 4))
+    #undef USE_TEXTURES_FIELDS
+    #undef USE_TEXTURES_CONSTANTS
+  #endif
 #endif
 
 // CUDA compiler specifications
@@ -236,6 +236,14 @@
 #undef USE_LAUNCH_BOUNDS
 #endif
 
+#ifdef GPU_DEVICE_Hopper
+// specifics see: https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html
+// register file size 64k 32-bit registers per SM
+// shared memory size 228KB per SM (maximum shared memory, 227KB per thread block)
+// maximum registers 255 per thread
+#undef USE_LAUNCH_BOUNDS
+#endif
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda kernel block size for updating displacements/potential (newmark time scheme)
@@ -287,7 +295,12 @@
 typedef float realw;
 
 // textures
+// note: texture templates are supported only for CUDA versions <= 11.x
+//       since CUDA 12.x, these are deprecated and texture objects should be used instead
+//       see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS)
 typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
+#endif
 
 // pointer declarations
 // restricted pointers: can improve performance on Kepler ~ 10%
diff --git a/src/gpu/rules.mk b/src/gpu/rules.mk
index a0bb3c020..20917c27e 100644
--- a/src/gpu/rules.mk
+++ b/src/gpu/rules.mk
@@ -122,6 +122,9 @@ ifeq ($(CUDA),yes)
   ifeq ($(CUDA11),yes)
     BUILD_VERSION_TXT += (v11)
   endif
+  ifeq ($(CUDA12),yes)
+    BUILD_VERSION_TXT += (v12)
+  endif
 endif
 
 BUILD_VERSION_TXT += support