From 89c403d6a9196ac97d4d3acde84fc72b1fd82089 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 3 May 2023 13:28:06 +0200 Subject: [PATCH] adds CUDA Hopper support --- Makefile.in | 8 ++++++- configure | 15 ++++++++++++ configure.ac | 4 +++- src/gpu/kernels/Kernel_2_viscoelastic_impl.cu | 2 -- src/gpu/mesh_constants_cuda.h | 23 +++++++++++++++---- src/gpu/rules.mk | 3 +++ 6 files changed, 46 insertions(+), 9 deletions(-) diff --git a/Makefile.in b/Makefile.in index 0ba94b640..bfd35256c 100644 --- a/Makefile.in +++ b/Makefile.in @@ -145,6 +145,9 @@ SCOTCH_LIBDIR = @SCOTCH_LIBDIR@ @COND_CUDA11_TRUE@CUDA11 = yes @COND_CUDA11_FALSE@CUDA11 = no +@COND_CUDA12_TRUE@CUDA12 = yes +@COND_CUDA12_FALSE@CUDA12 = no + # CUDA compilation with linking @COND_CUDA_PLUS_TRUE@CUDA_PLUS = yes @COND_CUDA_PLUS_FALSE@CUDA_PLUS = no @@ -173,7 +176,7 @@ CUDA_LINK = @CUDA_LDFLAGS@ @CUDA_LIBS@ -lstdc++ # Volta (cuda9, V100): -gencode=arch=compute_70,code=sm_70 # Turing (cuda10, T4, GeForce RTX 2080): -gencode=arch=compute_75,code=sm_75 # Ampere (cuda11, A100, GeForce RTX 3080): -gencode=arch=compute_80,code=sm_80 - +# Hopper (cuda12, H100): -gencode=arch=compute_90,code=sm_90 GENCODE_20 = -gencode=arch=compute_20,code=\"sm_20,compute_20\" GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\" GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\" @@ -184,8 +187,11 @@ GENCODE_60 = -gencode=arch=compute_60,code=\"sm_60,compute_60\" GENCODE_70 = -gencode=arch=compute_70,code=\"sm_70,compute_70\" GENCODE_75 = -gencode=arch=compute_75,code=\"sm_75,compute_75\" GENCODE_80 = -gencode=arch=compute_80,code=\"sm_80,compute_80\" +GENCODE_90 = -gencode=arch=compute_90,code=\"sm_90,compute_90\" # cuda preprocessor flag +# CUDA version 12.0 +@COND_CUDA_TRUE@@COND_CUDA12_TRUE@GENCODE = $(GENCODE_90) $(FC_DEFINE)GPU_DEVICE_Hopper # CUDA version 11.0 @COND_CUDA_TRUE@@COND_CUDA11_TRUE@GENCODE = $(GENCODE_80) $(FC_DEFINE)GPU_DEVICE_Ampere # CUDA version 10.0 diff --git a/configure b/configure index a25b98111..1ff46dd30 100755 --- a/configure +++ b/configure @@ -715,6 +715,8 @@ COND_OMP_FALSE COND_OMP_TRUE COND_CUDA_PLUS_FALSE COND_CUDA_PLUS_TRUE +COND_CUDA12_FALSE +COND_CUDA12_TRUE COND_CUDA11_FALSE COND_CUDA11_TRUE COND_CUDA10_FALSE @@ -3326,6 +3328,14 @@ else COND_CUDA11_FALSE= fi + if test x"$want_cuda" = xcuda12; then + COND_CUDA12_TRUE= + COND_CUDA12_FALSE='#' +else + COND_CUDA12_TRUE='#' + COND_CUDA12_FALSE= +fi + # cuda linking for cuda 5x and 6x and 7x and 8x and .. if test "$want_cuda" = cuda4 \ @@ -3336,6 +3346,7 @@ fi -o "$want_cuda" = cuda9 \ -o "$want_cuda" = cuda10 \ -o "$want_cuda" = cuda11 \ + -o "$want_cuda" = cuda12 \ ; then COND_CUDA_PLUS_TRUE= COND_CUDA_PLUS_FALSE='#' @@ -8956,6 +8967,10 @@ if test -z "${COND_CUDA11_TRUE}" && test -z "${COND_CUDA11_FALSE}"; then as_fn_error $? "conditional \"COND_CUDA11\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${COND_CUDA12_TRUE}" && test -z "${COND_CUDA12_FALSE}"; then + as_fn_error $? "conditional \"COND_CUDA12\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${COND_CUDA_PLUS_TRUE}" && test -z "${COND_CUDA_PLUS_FALSE}"; then as_fn_error $? "conditional \"COND_CUDA_PLUS\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 diff --git a/configure.ac b/configure.ac index aea34d26d..b3c618520 100644 --- a/configure.ac +++ b/configure.ac @@ -90,6 +90,7 @@ AM_CONDITIONAL([COND_CUDA8], [test x"$want_cuda" = xcuda8]) AM_CONDITIONAL([COND_CUDA9], [test x"$want_cuda" = xcuda9]) AM_CONDITIONAL([COND_CUDA10], [test x"$want_cuda" = xcuda10]) AM_CONDITIONAL([COND_CUDA11], [test x"$want_cuda" = xcuda11]) +AM_CONDITIONAL([COND_CUDA12], [test x"$want_cuda" = xcuda12]) # cuda linking for cuda 5x and 6x and 7x and 8x and .. AM_CONDITIONAL([COND_CUDA_PLUS], @@ -100,7 +101,8 @@ AM_CONDITIONAL([COND_CUDA_PLUS], -o "$want_cuda" = cuda8 \ -o "$want_cuda" = cuda9 \ -o "$want_cuda" = cuda10 \ - -o "$want_cuda" = cuda11 \] + -o "$want_cuda" = cuda11 \ + -o "$want_cuda" = cuda12 \] ) ### diff --git a/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu b/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu index b179d2759..4daa9771d 100644 --- a/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu +++ b/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu @@ -35,7 +35,6 @@ #ifdef USE_TEXTURES_FIELDS - realw_texture d_displ_tex; realw_texture d_accel_tex; // backward/reconstructed @@ -59,7 +58,6 @@ template<> __device__ float texfetch_accel<1>(int x) { return tex1Dfetch(d_accel // FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays template<> __device__ float texfetch_displ<3>(int x) { return tex1Dfetch(d_b_displ_tex, x); } template<> __device__ float texfetch_accel<3>(int x) { return tex1Dfetch(d_b_accel_tex, x); } - #endif #ifdef USE_TEXTURES_CONSTANTS diff --git a/src/gpu/mesh_constants_cuda.h b/src/gpu/mesh_constants_cuda.h index f2521cadb..0ea00a46c 100644 --- a/src/gpu/mesh_constants_cuda.h +++ b/src/gpu/mesh_constants_cuda.h @@ -153,11 +153,11 @@ //#define USE_TEXTURES_CONSTANTS // might not working properly yet, please test on your card... #ifdef USE_CUDA -// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize() -#if CUDA_VERSION < 4000 -#undef USE_TEXTURES_FIELDS -#undef USE_TEXTURES_CONSTANTS -#endif + // CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize() + #if CUDA_VERSION < 4000 || (defined (__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ < 4)) + #undef USE_TEXTURES_FIELDS + #undef USE_TEXTURES_CONSTANTS + #endif #endif // CUDA compiler specifications @@ -236,6 +236,14 @@ #undef USE_LAUNCH_BOUNDS #endif +#ifdef GPU_DEVICE_Hopper +// specifics see: https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html +// register file size 64k 32-bit registers per SM +// shared memory size 228KB per SM (maximum shared memory, 227KB per thread block) +// maximum registers 255 per thread +#undef USE_LAUNCH_BOUNDS +#endif + /* ----------------------------------------------------------------------------------------------- */ // cuda kernel block size for updating displacements/potential (newmark time scheme) @@ -287,7 +295,12 @@ typedef float realw; // textures +// note: texture templates are supported only for CUDA versions <= 11.x +// since CUDA 12.x, these are deprecated and texture objects should be used instead +// see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ +#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS) typedef texture realw_texture; +#endif // pointer declarations // restricted pointers: can improve performance on Kepler ~ 10% diff --git a/src/gpu/rules.mk b/src/gpu/rules.mk index a0bb3c020..20917c27e 100644 --- a/src/gpu/rules.mk +++ b/src/gpu/rules.mk @@ -122,6 +122,9 @@ ifeq ($(CUDA),yes) ifeq ($(CUDA11),yes) BUILD_VERSION_TXT += (v11) endif + ifeq ($(CUDA12),yes) + BUILD_VERSION_TXT += (v12) + endif endif BUILD_VERSION_TXT += support