Add GPU-Direct communications for 3D

fluidnumerics-joe · fluidnumerics-joe · commit 78bffd403e83 · 2024-09-25T00:48:18.000Z
Also adds GPU-Direct communications for 2D vectors.
diff --git a/src/gpu/SELF_DomainDecomposition.cpp b/src/gpu/SELF_DomainDecomposition.cpp
@@ -10,6 +10,7 @@ extern "C"
 
 #if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
         gpuaware = (int) MPIX_Query_rocm_support();
+        printf("Query rocm support");
 #endif
 
 #if defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA
diff --git a/src/gpu/SELF_DomainDecomposition.f90 b/src/gpu/SELF_DomainDecomposition.f90
@@ -75,7 +75,7 @@ subroutine Init_DomainDecomposition(this,enableMPI)
       if(check_gpu_aware_support() == 0) then
         print*,__FILE__" : Error! GPU Aware support is not detected. Stopping."
         call MPI_FINALIZE(ierror)
-        stop
+        stop 1
       endif
 
       call MPI_COMM_RANK(this%mpiComm,this%rankId,ierror)
diff --git a/src/gpu/SELF_GPUInterfaces.f90 b/src/gpu/SELF_GPUInterfaces.f90
@@ -108,6 +108,17 @@ subroutine SideExchange_2D_gpu(extboundary,boundary,sideinfo,elemToRank,rankid,o
     endsubroutine SideExchange_2D_gpu
   endinterface
 
+  interface ! explicit interface for the C-bound routine "ApplyFlip_2D_gpu" (presumably in the GPU C++ sources - confirm)
+    subroutine ApplyFlip_2D_gpu(extBoundary,sideInfo,elemToRank,rankId,offset,N,nVar,nEl) &
+      bind(c,name="ApplyFlip_2D_gpu")
+      use iso_c_binding
+      implicit none
+      type(c_ptr),value :: extBoundary,sideInfo,elemToRank ! raw addresses, passed by value
+      integer(c_int),value :: rankId,offset,N,nVar,nEl ! C int scalars, passed by value
+    endsubroutine ApplyFlip_2D_gpu
+  endinterface
+
+
   interface
     subroutine DG_BoundaryContribution_2D_gpu(bmatrix,qweights,bf,df,N,nvar,nel) &
       bind(c,name="DG_BoundaryContribution_2D_gpu")
@@ -137,6 +148,16 @@ subroutine SideExchange_3D_gpu(extboundary,boundary,sideinfo,elemToRank,rankid,o
     endsubroutine SideExchange_3D_gpu
   endinterface
 
+  interface ! explicit interface for the extern "C" routine ApplyFlip_3D_gpu defined in SELF_MappedData.cpp
+    subroutine ApplyFlip_3D_gpu(extBoundary,sideInfo,elemToRank,rankId,offset,N,nVar,nEl) &
+      bind(c,name="ApplyFlip_3D_gpu")
+      use iso_c_binding
+      implicit none
+      type(c_ptr),value :: extBoundary,sideInfo,elemToRank ! raw addresses, passed by value (real* / int* / int* on the C side)
+      integer(c_int),value :: rankId,offset,N,nVar,nEl ! C int scalars, passed by value
+    endsubroutine ApplyFlip_3D_gpu
+  endinterface
+
   interface
     subroutine DG_BoundaryContribution_3D_gpu(bmatrix,qweights,bf,df,N,nvar,nel) &
       bind(c,name="DG_BoundaryContribution_3D_gpu")
diff --git a/src/gpu/SELF_MappedData.cpp b/src/gpu/SELF_MappedData.cpp
@@ -168,7 +168,6 @@ __global__ void SideExchange_3D(real *extBoundary, real *boundary, int *sideInfo
   uint32_t idof = threadIdx.x + blockIdx.x*blockDim.x;
   uint32_t ndof = (N+1)*(N+1)*nEl*6;
 
-  
   if(idof < ndof){
 
     uint32_t s1 = (idof/(N+1)/(N+1)) % 6;
@@ -245,6 +244,113 @@ extern "C"
   }
 }
 
+// Applies the side "flip" reordering to exterior side data that was received over MPI.
+// One thread handles one (side, element, variable) triple of extBoundary; sides that have no
+// neighbor, whose neighbor lives on this rank, or whose flip code is outside 1..7 are untouched.
+//
+//   extBoundary : exterior side values, addressed with SCB_3D_INDEX; updated in place
+//   sideInfo    : 5 integers per side; entry 2 is the global neighbor element id (1-based,
+//                 not positive when absent) and entry 3 is 10*(neighbor side) + flip
+//   elemToRank  : owning rank of each global element
+//   rankId      : rank of the calling process
+//   offset      : not used by this kernel; retained so existing callers keep working
+//   N           : polynomial degree, each side carries (N+1)*(N+1) points
+//   nVar, nEl   : number of variables and of local elements
+//
+//   flip : local point (i1,j1) takes the received value stored at (i2,j2) =
+//     1  : (N-i1, j1)
+//     2  : (N-i1, N-j1)
+//     3  : (i1,   N-j1)
+//     4  : (j1,   i1)
+//     5  : (N-j1, i1)
+//     6  : (N-j1, N-i1)
+//     7  : (j1,   N-i1)
+__global__ void ApplyFlip_3D(real *extBoundary, int *sideInfo, int *elemToRank, int rankId, int offset, int N, int nVar, int nEl){
+
+  uint32_t idof = threadIdx.x + blockIdx.x*blockDim.x;
+  uint32_t ndof = nVar*nEl*6;
+  uint32_t s1 = (idof) % 6;      // 0-based side
+  uint32_t e1 = (idof/6) % nEl;  // 0-based local element
+  uint32_t ivar = idof/6/nEl;    // 0-based variable
+  const int nq = N+1;            // points per direction on a side
+
+  // buff (below) holds one side; its fixed size limits the polynomial degree to 8
+  // [ (N+1)^2 <= 81 ]. Larger N is skipped here instead of overrunning the buffer.
+  if(idof < ndof && nq*nq <= 81){
+    // A 3D element has 6 sides. NOTE(review): assumes the last INDEX3 argument is the side extent.
+    int e2Global = sideInfo[INDEX3(2,s1,e1,5,6)];
+    int flip = sideInfo[INDEX3(3,s1,e1,5,6)] % 10;
+    real buff[81];
+
+    // Only positive ids name a neighbor element. Flip codes outside 1..7 (0 included) have no
+    // reordering rule; they are skipped so the copy-back below never runs with buff left unset.
+    if(e2Global > 0 && flip >= 1 && flip <= 7){
+      int neighborRank = elemToRank[e2Global-1];
+      if( neighborRank != rankId ){
+        // Gather the side into buff in the re-oriented order
+        for( int j1 = 0; j1<nq; j1++){
+          for( int i1 = 0; i1<nq; i1++){
+            int i2;
+            int j2;
+            if(flip == 1){
+              i2 = N-i1;
+              j2 = j1;
+            }
+            else if(flip == 2){
+              i2 = N-i1;
+              j2 = N-j1;
+            }
+            else if(flip == 3){
+              i2 = i1;
+              j2 = N-j1;
+            }
+            else if(flip == 4){
+              i2 = j1;
+              j2 = i1;
+            }
+            else if(flip == 5){
+              i2 = N-j1;
+              j2 = i1;
+            }
+            else if(flip == 6){
+              i2 = N-j1;
+              j2 = N-i1;
+            }
+            else{ // flip == 7
+              i2 = j1;
+              j2 = N-i1;
+            }
+            // s1 and e1 are 0-based, exactly as they are used for sideInfo above
+            buff[i1+nq*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1,e1,ivar,N,nEl)];
+          }
+        }
+
+        // Write the re-oriented side back over the received data
+        for( int j1 = 0; j1<nq; j1++){
+          for( int i1 = 0; i1<nq; i1++){
+            extBoundary[SCB_3D_INDEX(i1,j1,s1,e1,ivar,N,nEl)] = buff[i1+nq*j1];
+          }
+        }
+      }
+    }
+  }
+
+}
+
+extern "C"
+{
+  // Host entry point with C linkage: launches ApplyFlip_3D on a 1D grid of 256-thread
+  // blocks sized to provide at least one thread per (side, element, variable) triple.
+  void ApplyFlip_3D_gpu(real *extBoundary, int *sideInfo, int *elemToRank, int rankId, int offset, int N, int nVar, int nEl)
+  {
+    const int blockSize = 256;
+    const int sideCount = 6*nEl*nVar;
+    const dim3 grid(sideCount/blockSize + 1,1,1);
+    const dim3 block(blockSize,1,1);
+    ApplyFlip_3D<<<grid,block>>>(extBoundary, sideInfo, elemToRank, rankId, offset, N, nVar, nEl);
+  }
+}
+
 __global__ void ContravariantWeight_gpukernel(real *scalar, real *dsdx, real *tensor, int ndof){
 
   uint32_t ivar = blockIdx.y; // variable dimension
diff --git a/src/gpu/SELF_MappedScalar_2D.f90 b/src/gpu/SELF_MappedScalar_2D.f90
@@ -54,16 +54,6 @@ module SELF_MappedScalar_2D
 
   endtype MappedScalar2D
 
-  interface
-    subroutine ApplyFlip_2D_gpu(extBoundary,sideInfo,elemToRank,rankId,offset,N,nVar,nEl) &
-      bind(c,name="ApplyFlip_2D_gpu")
-      use iso_c_binding
-      implicit none
-      type(c_ptr),value :: extBoundary,sideInfo,elemToRank
-      integer(c_int),value :: rankId,offset,N,nVar,nEl
-    endsubroutine ApplyFlip_2D_gpu
-  endinterface
-
   interface
     subroutine ContravariantWeight_2D_gpu(f,dsdx,jaf,N,nvar,nel) &
       bind(c,name="ContravariantWeight_2D_gpu")
diff --git a/src/gpu/SELF_MappedScalar_3D.f90 b/src/gpu/SELF_MappedScalar_3D.f90
@@ -44,6 +44,7 @@ module SELF_MappedScalar_3D
     procedure,public :: SetInteriorFromEquation => SetInteriorFromEquation_MappedScalar3D
 
     procedure,public :: SideExchange => SideExchange_MappedScalar3D
+    procedure,private :: MPIExchangeAsync => MPIExchangeAsync_MappedScalar3D
 
     generic,public :: MappedGradient => MappedGradient_MappedScalar3D
     procedure,private :: MappedGradient_MappedScalar3D
@@ -187,6 +188,68 @@ subroutine SetInteriorFromEquation_MappedScalar3D(this,geometry,time)
 
   endsubroutine SetInteriorFromEquation_MappedScalar3D
 
+  subroutine MPIExchangeAsync_MappedScalar3D(this,mesh,resetCount)
+    ! Posts one non-blocking receive/send pair per variable for every element side whose neighbor
+    ! element belongs to another rank: receives fill extboundary_gpu, sends read boundary_gpu.
+    ! Requests are stored in mesh%decomp%requests; nothing is waited on here.
+    implicit none
+    class(MappedScalar3D),intent(inout) :: this
+    type(Mesh3D),intent(inout) :: mesh
+    logical,intent(in) :: resetCount ! .true.: number requests from 1; .false.: continue after mesh%decomp%msgCount
+    ! Local
+    integer :: e1,s1,e2,ivar
+    integer :: globalSideId,r2,tag
+    integer :: iError
+    integer :: msgCount
+    integer :: nSidePts
+    ! contiguous : the compiler need not stage the sections below in temporaries a non-blocking call would outlive
+    real(prec),pointer,contiguous :: boundary(:,:,:,:,:)
+    real(prec),pointer,contiguous :: extboundary(:,:,:,:,:)
+
+    if(resetCount) then
+      msgCount = 0
+    else
+      msgCount = mesh%decomp%msgCount
+    endif
+    nSidePts = (this%interp%N+1)*(this%interp%N+1) ! values exchanged per side and variable
+    call c_f_pointer(this%boundary_gpu,boundary,[this%interp%N+1,this%interp%N+1,6,this%nelem,this%nvar])
+    call c_f_pointer(this%extboundary_gpu,extboundary,[this%interp%N+1,this%interp%N+1,6,this%nelem,this%nvar])
+
+    do ivar = 1,this%nvar
+      do e1 = 1,this%nElem
+        do s1 = 1,6
+          e2 = mesh%sideInfo(3,s1,e1) ! Neighbor Element
+          if(e2 > 0) then
+            r2 = mesh%decomp%elemToRank(e2) ! Neighbor Rank
+            if(r2 /= mesh%decomp%rankId) then
+              globalSideId = abs(mesh%sideInfo(2,s1,e1))
+              ! create unique tag for each side and each variable
+              tag = globalsideid+mesh%nUniqueSides*(ivar-1)
+
+              msgCount = msgCount+1
+              call MPI_IRECV(extBoundary(:,:,s1,e1,ivar), &
+                             nSidePts, &
+                             mesh%decomp%mpiPrec, &
+                             r2,tag, &
+                             mesh%decomp%mpiComm, &
+                             mesh%decomp%requests(msgCount),iError)
+
+              msgCount = msgCount+1
+              call MPI_ISEND(boundary(:,:,s1,e1,ivar), &
+                             nSidePts, &
+                             mesh%decomp%mpiPrec, &
+                             r2,tag, &
+                             mesh%decomp%mpiComm, &
+                             mesh%decomp%requests(msgCount),iError)
+            endif
+          endif
+        enddo
+      enddo
+    enddo
+
+    mesh%decomp%msgCount = msgCount
+  endsubroutine MPIExchangeAsync_MappedScalar3D
+
   subroutine SideExchange_MappedScalar3D(this,mesh)
     implicit none
     class(MappedScalar3D),intent(inout) :: this
@@ -196,16 +259,21 @@ subroutine SideExchange_MappedScalar3D(this,mesh)
 
     offset = mesh%decomp%offsetElem(mesh%decomp%rankId+1)
 
-    !call this%MPIExchangeAsync(mesh%decomp,mesh,resetCount=.true.)
+    if(mesh%decomp%mpiEnabled) then
+      call this%MPIExchangeAsync(mesh,resetCount=.true.)
+    endif
 
     call SideExchange_3D_gpu(this%extboundary_gpu, &
                              this%boundary_gpu,mesh%sideinfo_gpu,mesh%decomp%elemToRank_gpu, &
                              mesh%decomp%rankid,offset,this%interp%N,this%nvar,this%nelem)
 
-    !call decomp%FinalizeMPIExchangeAsync()
-
-    ! Apply side flips for data exchanged with MPI
-    !call this%ApplyFlip(decomp,mesh)
+    if(mesh%decomp%mpiEnabled) then
+      call mesh%decomp%FinalizeMPIExchangeAsync()
+      ! Apply side flips for data exchanged with MPI
+      call ApplyFlip_3D_gpu(this%extboundary_gpu,mesh%sideInfo_gpu, &
+                            mesh%decomp%elemToRank_gpu,mesh%decomp%rankId, &
+                            offset,this%interp%N,this%nVar,this%nElem)
+    endif
 
   endsubroutine SideExchange_MappedScalar3D
 
diff --git a/src/gpu/SELF_MappedVector_2D.f90 b/src/gpu/SELF_MappedVector_2D.f90
diff --git a/src/gpu/SELF_MappedVector_3D.f90 b/src/gpu/SELF_MappedVector_3D.f90
diff --git a/src/gpu/SELF_Mesh_2D.f90 b/src/gpu/SELF_Mesh_2D.f90