@@ -478,26 +478,23 @@ static void bksub_interleaved(NrnThread* nt,
478
478
}
479
479
480
480
// icore ranges [0:warpsize) ; stride[ncycle]
481
+ nrn_pragma_acc (routine vector)
481
482
static void triang_interleaved2 (NrnThread* nt, int icore, int ncycle, int * stride, int lastnode) {
482
483
int icycle = ncycle - 1 ;
483
484
int istride = stride[icycle];
484
485
int i = lastnode - istride + icore;
485
- // #ifndef CORENEURON_ENABLE_GPU
486
486
int ii = i;
487
- // #endif
488
487
489
488
// execute until all tree depths are executed
490
489
bool has_subtrees_to_compute = true ;
491
490
492
491
// clang-format off
493
492
nrn_pragma_acc (loop seq)
494
493
for (; has_subtrees_to_compute; ) { // ncycle loop
495
- // #ifndef CORENEURON_ENABLE_GPU
496
494
// serial test, gpu does this in parallel
497
- nrn_pragma_acc (loop)
495
+ nrn_pragma_acc (loop vector )
498
496
for (int icore = 0 ; icore < warpsize; ++icore) {
499
497
int i = ii + icore;
500
- // #endif
501
498
if (icore < istride) { // most efficient if istride equal warpsize
502
499
// what is the index
503
500
int ip = GPU_PARENT (i);
@@ -509,9 +506,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
509
506
nrn_pragma_omp (atomic update)
510
507
GPU_RHS (ip) -= p * GPU_RHS (i);
511
508
}
512
- // #ifndef CORENEURON_ENABLE_GPU
513
509
}
514
- // #endif
515
510
// if finished with all tree depths then ready to break
516
511
// (note that break is not allowed in OpenACC)
517
512
if (icycle == 0 ) {
@@ -521,52 +516,41 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
521
516
--icycle;
522
517
istride = stride[icycle];
523
518
i -= istride;
524
- // #ifndef CORENEURON_ENABLE_GPU
525
519
ii -= istride;
526
- // #endif
527
520
}
528
- // clang-format on
529
521
}
530
522
531
523
// Back-substitution step of the interleaved solver (going by the name
// "bksub") over one group of entries.
//
// Phase 1: every entry i in [root, lastroot) gets GPU_RHS(i) /= GPU_D(i).
// Phase 2: the base index starts at firstnode + icore. For each cycle
//   icycle in [0, ncycle) and each lane in [0, warpsize) with
//   lane < stride[icycle], the entry i = base + lane is updated with
//     GPU_RHS(i) -= GPU_B(i) * GPU_RHS(GPU_PARENT(i));
//     GPU_RHS(i) /= GPU_D(i);
//   and the base then advances by stride[icycle].
//
// Parameters:
//   nt        - not referenced by name in this body; presumably consumed by
//               the GPU_* macros, which are defined outside this view
//               (TODO confirm)
//   root      - first index handled in phase 1
//   lastroot  - one past the last index handled in phase 1
//   icore     - offset added to firstnode to form the initial base index
//   ncycle    - number of cycles; stride[0] .. stride[ncycle - 1] are read
//   stride    - per-cycle value used both as the active-lane limit and as the
//               amount by which the base index advances
//   firstnode - index that, together with icore, seeds the phase 2 base
//
// icore ranges [0:warpsize) ; stride[ncycle]
524
// presumably expands to an OpenACC `routine vector` directive for the
// function below (macro defined outside this view -- TODO confirm)
+ nrn_pragma_acc (routine vector)
532
525
static void bksub_interleaved2 (NrnThread* nt,
533
526
int root,
534
527
int lastroot,
535
528
int icore,
536
529
int ncycle,
537
530
int * stride,
538
531
int firstnode) {
539
- // #ifndef CORENEURON_ENABLE_GPU
532
// phase 1: the [root, lastroot) entries are visited one at a time, in order
+ nrn_pragma_acc (loop seq)
540
533
for (int i = root; i < lastroot; i += 1 ) {
541
- // #else
542
- // nrn_pragma_acc(loop seq)
543
- // for (int i = root; i < lastroot; i += warpsize) {
544
- // #endif
545
534
GPU_RHS (i) /= GPU_D (i); // the root
546
535
}
547
536
548
537
// NOTE(review): in this block the function-scope `i` is only read once, to
// seed `ii`; the inner loop declares its own `i` that shadows it.
int i = firstnode + icore;
549
- // #ifndef CORENEURON_ENABLE_GPU
550
538
// `ii` is the base index of the current cycle; it advances by istride per cycle
int ii = i;
551
- // #endif
539
// phase 2: cycles are processed in increasing icycle order
+ nrn_pragma_acc (loop seq)
552
540
for (int icycle = 0 ; icycle < ncycle; ++icycle) {
553
541
int istride = stride[icycle];
554
- // #ifndef CORENEURON_ENABLE_GPU
555
- nrn_pragma_acc (loop)
556
542
// serial test, gpu does this in parallel
543
+ nrn_pragma_acc (loop vector)
557
// NOTE(review): this loop variable shadows the `icore` parameter, which is
// therefore only used above to compute the initial base index.
544
for (int icore = 0 ; icore < warpsize; ++icore) {
558
545
int i = ii + icore;
559
- // #endif
560
546
// lanes at or beyond istride do no work in this cycle
if (icore < istride) {
561
547
int ip = GPU_PARENT (i);
562
548
GPU_RHS (i) -= GPU_B (i) * GPU_RHS (ip);
563
549
GPU_RHS (i) /= GPU_D (i);
564
550
}
565
551
// NOTE(review): this updates the loop-local `i` declared above, which is
// re-initialized from `ii` on the next iteration, so the statement appears
// to have no effect in this form of the code -- confirm before removing.
i += istride;
566
- // #ifndef CORENEURON_ENABLE_GPU
567
552
}
568
553
ii += istride;
569
- // #endif
570
554
}
571
555
}
572
556
@@ -602,15 +586,14 @@ void solve_interleaved2(int ith) {
602
586
defined (_OPENACC)
603
587
int nstride = stridedispl[nwarp];
604
588
#endif
605
- nrn_pragma_acc (parallel loop gang vector vector_length (
606
- warpsize) present (nt [0 :1 ],
589
+ nrn_pragma_acc (parallel loop gang present (nt [0 :1 ],
607
590
strides [0 :nstride],
608
591
ncycles [0 :nwarp],
609
592
stridedispl [0 :nwarp + 1 ],
610
593
rootbegin [0 :nwarp + 1 ],
611
594
nodebegin [0 :nwarp + 1 ]) if (nt->compute_gpu ) async (nt->stream_id ))
612
595
nrn_pragma_omp (target teams distribute parallel for simd if (nt->compute_gpu ))
613
- for (int icore = 0 ; icore < ncore; ++ icore) {
596
+ for (int icore = 0 ; icore < ncore; icore += warpsize ) {
614
597
int iwarp = icore / warpsize; // figure out the >> value
615
598
int ic = icore & (warpsize - 1 ); // figure out the & mask
616
599
int ncycle = ncycles[iwarp];
@@ -619,14 +602,10 @@ void solve_interleaved2(int ith) {
619
602
int lastroot = rootbegin[iwarp + 1 ];
620
603
int firstnode = nodebegin[iwarp];
621
604
int lastnode = nodebegin[iwarp + 1 ];
622
- // #ifndef CORENEURON_ENABLE_GPU
623
- if (ic == 0 ) { // serial test mode. triang and bksub do all cores in warp
624
- // #endif
625
- triang_interleaved2 (nt, ic, ncycle, stride, lastnode);
626
- bksub_interleaved2 (nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
627
- // #ifndef CORENEURON_ENABLE_GPU
628
- } // serial test mode
629
- // #endif
605
+
606
+ // triang and bksub do all cores in warp
607
+ triang_interleaved2 (nt, ic, ncycle, stride, lastnode);
608
+ bksub_interleaved2 (nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
630
609
}
631
610
nrn_pragma_acc (wait (nt->stream_id ))
632
611
#ifdef _OPENACC
0 commit comments