Skip to content

Commit

Permalink
adds CUDA Hopper support
Browse files Browse the repository at this point in the history
  • Loading branch information
danielpeter committed May 3, 2023
1 parent a423452 commit 89c403d
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 9 deletions.
8 changes: 7 additions & 1 deletion Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ SCOTCH_LIBDIR = @SCOTCH_LIBDIR@
@COND_CUDA11_TRUE@CUDA11 = yes
@COND_CUDA11_FALSE@CUDA11 = no

@COND_CUDA12_TRUE@CUDA12 = yes
@COND_CUDA12_FALSE@CUDA12 = no

# CUDA compilation with linking
@COND_CUDA_PLUS_TRUE@CUDA_PLUS = yes
@COND_CUDA_PLUS_FALSE@CUDA_PLUS = no
Expand Down Expand Up @@ -173,7 +176,7 @@ CUDA_LINK = @CUDA_LDFLAGS@ @CUDA_LIBS@ -lstdc++
# Volta (cuda9, V100): -gencode=arch=compute_70,code=sm_70
# Turing (cuda10, T4, GeForce RTX 2080): -gencode=arch=compute_75,code=sm_75
# Ampere (cuda11, A100, GeForce RTX 3080): -gencode=arch=compute_80,code=sm_80

# Hopper (cuda12, H100): -gencode=arch=compute_90,code=sm_90
GENCODE_20 = -gencode=arch=compute_20,code=\"sm_20,compute_20\"
GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\"
GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
Expand All @@ -184,8 +187,11 @@ GENCODE_60 = -gencode=arch=compute_60,code=\"sm_60,compute_60\"
GENCODE_70 = -gencode=arch=compute_70,code=\"sm_70,compute_70\"
GENCODE_75 = -gencode=arch=compute_75,code=\"sm_75,compute_75\"
GENCODE_80 = -gencode=arch=compute_80,code=\"sm_80,compute_80\"
GENCODE_90 = -gencode=arch=compute_90,code=\"sm_90,compute_90\"

# cuda preprocessor flag
# CUDA version 12.0
@COND_CUDA_TRUE@@COND_CUDA12_TRUE@GENCODE = $(GENCODE_90) $(FC_DEFINE)GPU_DEVICE_Hopper
# CUDA version 11.0
@COND_CUDA_TRUE@@COND_CUDA11_TRUE@GENCODE = $(GENCODE_80) $(FC_DEFINE)GPU_DEVICE_Ampere
# CUDA version 10.0
Expand Down
15 changes: 15 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,8 @@ COND_OMP_FALSE
COND_OMP_TRUE
COND_CUDA_PLUS_FALSE
COND_CUDA_PLUS_TRUE
COND_CUDA12_FALSE
COND_CUDA12_TRUE
COND_CUDA11_FALSE
COND_CUDA11_TRUE
COND_CUDA10_FALSE
Expand Down Expand Up @@ -3326,6 +3328,14 @@ else
COND_CUDA11_FALSE=
fi

if test x"$want_cuda" = xcuda12; then
COND_CUDA12_TRUE=
COND_CUDA12_FALSE='#'
else
COND_CUDA12_TRUE='#'
COND_CUDA12_FALSE=
fi


# cuda linking for cuda 5x and 6x and 7x and 8x and ..
if test "$want_cuda" = cuda4 \
Expand All @@ -3336,6 +3346,7 @@ fi
-o "$want_cuda" = cuda9 \
-o "$want_cuda" = cuda10 \
-o "$want_cuda" = cuda11 \
-o "$want_cuda" = cuda12 \
; then
COND_CUDA_PLUS_TRUE=
COND_CUDA_PLUS_FALSE='#'
Expand Down Expand Up @@ -8956,6 +8967,10 @@ if test -z "${COND_CUDA11_TRUE}" && test -z "${COND_CUDA11_FALSE}"; then
as_fn_error $? "conditional \"COND_CUDA11\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${COND_CUDA12_TRUE}" && test -z "${COND_CUDA12_FALSE}"; then
as_fn_error $? "conditional \"COND_CUDA12\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${COND_CUDA_PLUS_TRUE}" && test -z "${COND_CUDA_PLUS_FALSE}"; then
as_fn_error $? "conditional \"COND_CUDA_PLUS\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
Expand Down
4 changes: 3 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ AM_CONDITIONAL([COND_CUDA8], [test x"$want_cuda" = xcuda8])
AM_CONDITIONAL([COND_CUDA9], [test x"$want_cuda" = xcuda9])
AM_CONDITIONAL([COND_CUDA10], [test x"$want_cuda" = xcuda10])
AM_CONDITIONAL([COND_CUDA11], [test x"$want_cuda" = xcuda11])
AM_CONDITIONAL([COND_CUDA12], [test x"$want_cuda" = xcuda12])

# cuda linking for cuda 5x and 6x and 7x and 8x and ..
AM_CONDITIONAL([COND_CUDA_PLUS],
Expand All @@ -100,7 +101,8 @@ AM_CONDITIONAL([COND_CUDA_PLUS],
-o "$want_cuda" = cuda8 \
-o "$want_cuda" = cuda9 \
-o "$want_cuda" = cuda10 \
-o "$want_cuda" = cuda11 \]
-o "$want_cuda" = cuda11 \
-o "$want_cuda" = cuda12 \]
)

###
Expand Down
2 changes: 0 additions & 2 deletions src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@


#ifdef USE_TEXTURES_FIELDS

realw_texture d_displ_tex;
realw_texture d_accel_tex;
// backward/reconstructed
Expand All @@ -59,7 +58,6 @@ template<> __device__ float texfetch_accel<1>(int x) { return tex1Dfetch(d_accel
// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
template<> __device__ float texfetch_displ<3>(int x) { return tex1Dfetch(d_b_displ_tex, x); }
template<> __device__ float texfetch_accel<3>(int x) { return tex1Dfetch(d_b_accel_tex, x); }

#endif

#ifdef USE_TEXTURES_CONSTANTS
Expand Down
23 changes: 18 additions & 5 deletions src/gpu/mesh_constants_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@
//#define USE_TEXTURES_CONSTANTS // might not working properly yet, please test on your card...

#ifdef USE_CUDA
// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
#if CUDA_VERSION < 4000
#undef USE_TEXTURES_FIELDS
#undef USE_TEXTURES_CONSTANTS
#endif
// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
#if CUDA_VERSION < 4000 || (defined (__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ < 4))
#undef USE_TEXTURES_FIELDS
#undef USE_TEXTURES_CONSTANTS
#endif
#endif

// CUDA compiler specifications
Expand Down Expand Up @@ -236,6 +236,14 @@
#undef USE_LAUNCH_BOUNDS
#endif

#ifdef GPU_DEVICE_Hopper
// specifics see: https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html
// register file size 64k 32-bit registers per SM
// shared memory size 228KB per SM (maximum shared memory, 227KB per thread block)
// maximum registers 255 per thread
#undef USE_LAUNCH_BOUNDS
#endif

/* ----------------------------------------------------------------------------------------------- */

// cuda kernel block size for updating displacements/potential (newmark time scheme)
Expand Down Expand Up @@ -287,7 +295,12 @@
typedef float realw;

// textures
// note: texture templates are supported only for CUDA versions <= 11.x
// since CUDA 12.x, these are deprecated and texture objects should be used instead
// see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS)
typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
#endif

// pointer declarations
// restricted pointers: can improve performance on Kepler ~ 10%
Expand Down
3 changes: 3 additions & 0 deletions src/gpu/rules.mk
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ ifeq ($(CUDA),yes)
ifeq ($(CUDA11),yes)
BUILD_VERSION_TXT += (v11)
endif
ifeq ($(CUDA12),yes)
BUILD_VERSION_TXT += (v12)
endif
endif

BUILD_VERSION_TXT += support
Expand Down

0 comments on commit 89c403d

Please sign in to comment.