Skip to content

Commit 78bffd4

Browse files
Add GPU-Direct communications for 3D
Also adds gpu-direct for vector 2d
1 parent fdd6cfc commit 78bffd4

9 files changed

+359
-34
lines changed

src/gpu/SELF_DomainDecomposition.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ extern "C"
1010

1111
#if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
1212
gpuaware = (int) MPIX_Query_rocm_support();
13+
printf("Query rocm support");
1314
#endif
1415

1516
#if defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA

src/gpu/SELF_DomainDecomposition.f90

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ subroutine Init_DomainDecomposition(this,enableMPI)
7575
if(check_gpu_aware_support() == 0) then
7676
print*,__FILE__" : Error! GPU Aware support is not detected. Stopping."
7777
call MPI_FINALIZE(ierror)
78-
stop
78+
stop 1
7979
endif
8080

8181
call MPI_COMM_RANK(this%mpiComm,this%rankId,ierror)

src/gpu/SELF_GPUInterfaces.f90

+21
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,17 @@ subroutine SideExchange_2D_gpu(extboundary,boundary,sideinfo,elemToRank,rankid,o
108108
endsubroutine SideExchange_2D_gpu
109109
endinterface
110110

111+
interface
112+
subroutine ApplyFlip_2D_gpu(extBoundary,sideInfo,elemToRank,rankId,offset,N,nVar,nEl) &
113+
bind(c,name="ApplyFlip_2D_gpu")
114+
use iso_c_binding
115+
implicit none
116+
type(c_ptr),value :: extBoundary,sideInfo,elemToRank
117+
integer(c_int),value :: rankId,offset,N,nVar,nEl
118+
endsubroutine ApplyFlip_2D_gpu
119+
endinterface
120+
121+
111122
interface
112123
subroutine DG_BoundaryContribution_2D_gpu(bmatrix,qweights,bf,df,N,nvar,nel) &
113124
bind(c,name="DG_BoundaryContribution_2D_gpu")
@@ -137,6 +148,16 @@ subroutine SideExchange_3D_gpu(extboundary,boundary,sideinfo,elemToRank,rankid,o
137148
endsubroutine SideExchange_3D_gpu
138149
endinterface
139150

151+
interface
152+
subroutine ApplyFlip_3D_gpu(extBoundary,sideInfo,elemToRank,rankId,offset,N,nVar,nEl) &
153+
bind(c,name="ApplyFlip_3D_gpu")
154+
use iso_c_binding
155+
implicit none
156+
type(c_ptr),value :: extBoundary,sideInfo,elemToRank
157+
integer(c_int),value :: rankId,offset,N,nVar,nEl
158+
endsubroutine ApplyFlip_3D_gpu
159+
endinterface
160+
140161
interface
141162
subroutine DG_BoundaryContribution_3D_gpu(bmatrix,qweights,bf,df,N,nvar,nel) &
142163
bind(c,name="DG_BoundaryContribution_3D_gpu")

src/gpu/SELF_MappedData.cpp

+107-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ __global__ void SideExchange_3D(real *extBoundary, real *boundary, int *sideInfo
168168
uint32_t idof = threadIdx.x + blockIdx.x*blockDim.x;
169169
uint32_t ndof = (N+1)*(N+1)*nEl*6;
170170

171-
172171
if(idof < ndof){
173172

174173
uint32_t s1 = (idof/(N+1)/(N+1)) % 6;
@@ -245,6 +244,113 @@ extern "C"
245244
}
246245
}
247246

247+
__global__ void ApplyFlip_3D(real *extBoundary, int *sideInfo, int *elemToRank, int rankId, int offset, int N, int nVar, int nEl){
248+
249+
uint32_t idof = threadIdx.x + blockIdx.x*blockDim.x;
250+
uint32_t ndof = nVar*nEl*6;
251+
uint32_t s1 = (idof) % 6;
252+
uint32_t e1 = (idof/6) % nEl;
253+
uint32_t ivar = idof/6/nEl;
254+
255+
if(idof < ndof){
256+
int e2Global = sideInfo[INDEX3(2,s1,e1,5,4)];
257+
int e2 = e2Global - offset;
258+
int s2 = sideInfo[INDEX3(3,s1,e1,5,4)]/10;
259+
int flip = sideInfo[INDEX3(3,s1,e1,5,4)]-s2*10;
260+
real buff[81]; // warning : set fixed buffer size for applying flip. This limits the polynomial degree to 8 [ (N+1)^2 <= 81 ]
261+
262+
if(e2Global != 0){
263+
int neighborRank = elemToRank[e2Global-1];
264+
if( neighborRank != rankId ){
265+
266+
267+
if(flip == 1){
268+
for( int j1 = 0; j1<N+1; j1++){
269+
for( int i1 = 0; i1<N+1; i1++){
270+
int i2 = N-i1;
271+
int j2 = j1;
272+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
273+
}
274+
}
275+
}
276+
else if(flip == 2){
277+
for( int j1 = 0; j1<N+1; j1++){
278+
for( int i1 = 0; i1<N+1; i1++){
279+
int i2 = N-i1;
280+
int j2 = N-j1;
281+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
282+
}
283+
}
284+
}
285+
else if(flip == 3){
286+
for( int j1 = 0; j1<N+1; j1++){
287+
for( int i1 = 0; i1<N+1; i1++){
288+
int i2 = i1;
289+
int j2 = N-j1;
290+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
291+
}
292+
}
293+
}
294+
else if(flip == 4){
295+
for( int j1 = 0; j1<N+1; j1++){
296+
for( int i1 = 0; i1<N+1; i1++){
297+
int i2 = j1;
298+
int j2 = i1;
299+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
300+
}
301+
}
302+
}
303+
else if(flip == 5){
304+
for( int j1 = 0; j1<N+1; j1++){
305+
for( int i1 = 0; i1<N+1; i1++){
306+
int i2 = N-j1;
307+
int j2 = i1;
308+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
309+
}
310+
} }
311+
else if(flip == 6){
312+
for( int j1 = 0; j1<N+1; j1++){
313+
for( int i1 = 0; i1<N+1; i1++){
314+
int i2 = N-j1;
315+
int j2 = N-i1;
316+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
317+
}
318+
} }
319+
else if(flip == 7){
320+
for( int j1 = 0; j1<N+1; j1++){
321+
for( int i1 = 0; i1<N+1; i1++){
322+
int i2 = j1;
323+
int j2 = N-i1;
324+
buff[i1+(N+1)*j1] = extBoundary[SCB_3D_INDEX(i2,j2,s1-1,e1-1,ivar,N,nEl)];
325+
}
326+
}
327+
}
328+
for( int j1 = 0; j1<N+1; j1++){
329+
for( int i1 = 0; i1<N+1; i1++){
330+
extBoundary[SCB_3D_INDEX(i1,j1,s1-1,e1-1,ivar,N,nEl)] = buff[i1+(N+1)*j1];
331+
}
332+
}
333+
334+
}
335+
}
336+
}
337+
338+
}
339+
340+
extern "C"
341+
{
342+
void ApplyFlip_3D_gpu(real *extBoundary, int *sideInfo, int *elemToRank, int rankId, int offset, int N, int nVar, int nEl)
343+
{
344+
int ndof = 6*nEl*nVar;
345+
int threads_per_block = 256;
346+
int nblocks_x = ndof/threads_per_block + 1;
347+
348+
dim3 nblocks(nblocks_x,1,1);
349+
dim3 nthreads(threads_per_block,1,1);
350+
ApplyFlip_3D<<<nblocks,nthreads>>>(extBoundary, sideInfo, elemToRank, rankId, offset, N, nVar, nEl);
351+
}
352+
}
353+
248354
__global__ void ContravariantWeight_gpukernel(real *scalar, real *dsdx, real *tensor, int ndof){
249355

250356
uint32_t ivar = blockIdx.y; // variable dimension

src/gpu/SELF_MappedScalar_2D.f90

-10
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,6 @@ module SELF_MappedScalar_2D
5454

5555
endtype MappedScalar2D
5656

57-
interface
58-
subroutine ApplyFlip_2D_gpu(extBoundary,sideInfo,elemToRank,rankId,offset,N,nVar,nEl) &
59-
bind(c,name="ApplyFlip_2D_gpu")
60-
use iso_c_binding
61-
implicit none
62-
type(c_ptr),value :: extBoundary,sideInfo,elemToRank
63-
integer(c_int),value :: rankId,offset,N,nVar,nEl
64-
endsubroutine ApplyFlip_2D_gpu
65-
endinterface
66-
6757
interface
6858
subroutine ContravariantWeight_2D_gpu(f,dsdx,jaf,N,nvar,nel) &
6959
bind(c,name="ContravariantWeight_2D_gpu")

src/gpu/SELF_MappedScalar_3D.f90

+73-5
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ module SELF_MappedScalar_3D
4444
procedure,public :: SetInteriorFromEquation => SetInteriorFromEquation_MappedScalar3D
4545

4646
procedure,public :: SideExchange => SideExchange_MappedScalar3D
47+
procedure,private :: MPIExchangeAsync => MPIExchangeAsync_MappedScalar3D
4748

4849
generic,public :: MappedGradient => MappedGradient_MappedScalar3D
4950
procedure,private :: MappedGradient_MappedScalar3D
@@ -187,6 +188,68 @@ subroutine SetInteriorFromEquation_MappedScalar3D(this,geometry,time)
187188

188189
endsubroutine SetInteriorFromEquation_MappedScalar3D
189190

191+
subroutine MPIExchangeAsync_MappedScalar3D(this,mesh,resetCount)
192+
implicit none
193+
class(MappedScalar3D),intent(inout) :: this
194+
type(Mesh3D),intent(inout) :: mesh
195+
logical,intent(in) :: resetCount
196+
! Local
197+
integer :: e1,s1,e2,s2,ivar
198+
integer :: globalSideId,r2,tag
199+
integer :: iError
200+
integer :: msgCount
201+
real(prec),pointer :: boundary(:,:,:,:,:)
202+
real(prec),pointer :: extboundary(:,:,:,:,:)
203+
204+
if(resetCount) then
205+
msgCount = 0
206+
else
207+
msgCount = mesh%decomp%msgCount
208+
endif
209+
call c_f_pointer(this%boundary_gpu,boundary,[this%interp%N+1,this%interp%N+1,6,this%nelem,this%nvar])
210+
call c_f_pointer(this%extboundary_gpu,extboundary,[this%interp%N+1,this%interp%N+1,6,this%nelem,this%nvar])
211+
212+
do ivar = 1,this%nvar
213+
do e1 = 1,this%nElem
214+
do s1 = 1,6
215+
216+
e2 = mesh%sideInfo(3,s1,e1) ! Neighbor Element
217+
if(e2 > 0) then
218+
r2 = mesh%decomp%elemToRank(e2) ! Neighbor Rank
219+
220+
if(r2 /= mesh%decomp%rankId) then
221+
222+
s2 = mesh%sideInfo(4,s1,e1)/10
223+
globalSideId = abs(mesh%sideInfo(2,s1,e1))
224+
! create unique tag for each side and each variable
225+
tag = globalsideid+mesh%nUniqueSides*(ivar-1)
226+
227+
msgCount = msgCount+1
228+
call MPI_IRECV(extBoundary(:,:,s1,e1,ivar), &
229+
(this%interp%N+1)*(this%interp%N+1), &
230+
mesh%decomp%mpiPrec, &
231+
r2,tag, &
232+
mesh%decomp%mpiComm, &
233+
mesh%decomp%requests(msgCount),iError)
234+
235+
msgCount = msgCount+1
236+
call MPI_ISEND(boundary(:,:,s1,e1,ivar), &
237+
(this%interp%N+1)*(this%interp%N+1), &
238+
mesh%decomp%mpiPrec, &
239+
r2,tag, &
240+
mesh%decomp%mpiComm, &
241+
mesh%decomp%requests(msgCount),iError)
242+
endif
243+
endif
244+
245+
enddo
246+
enddo
247+
enddo
248+
249+
mesh%decomp%msgCount = msgCount
250+
251+
endsubroutine MPIExchangeAsync_MappedScalar3D
252+
190253
subroutine SideExchange_MappedScalar3D(this,mesh)
191254
implicit none
192255
class(MappedScalar3D),intent(inout) :: this
@@ -196,16 +259,21 @@ subroutine SideExchange_MappedScalar3D(this,mesh)
196259

197260
offset = mesh%decomp%offsetElem(mesh%decomp%rankId+1)
198261

199-
!call this%MPIExchangeAsync(mesh%decomp,mesh,resetCount=.true.)
262+
if(mesh%decomp%mpiEnabled) then
263+
call this%MPIExchangeAsync(mesh,resetCount=.true.)
264+
endif
200265

201266
call SideExchange_3D_gpu(this%extboundary_gpu, &
202267
this%boundary_gpu,mesh%sideinfo_gpu,mesh%decomp%elemToRank_gpu, &
203268
mesh%decomp%rankid,offset,this%interp%N,this%nvar,this%nelem)
204269

205-
!call decomp%FinalizeMPIExchangeAsync()
206-
207-
! Apply side flips for data exchanged with MPI
208-
!call this%ApplyFlip(decomp,mesh)
270+
if(mesh%decomp%mpiEnabled) then
271+
call mesh%decomp%FinalizeMPIExchangeAsync()
272+
! Apply side flips for data exchanged with MPI
273+
call ApplyFlip_3D_gpu(this%extboundary_gpu,mesh%sideInfo_gpu, &
274+
mesh%decomp%elemToRank_gpu,mesh%decomp%rankId, &
275+
offset,this%interp%N,this%nVar,this%nElem)
276+
endif
209277

210278
endsubroutine SideExchange_MappedScalar3D
211279

0 commit comments

Comments
 (0)