Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit f827cf4

Browse files
author
Christos Kotsalos
committed
fixing race condition in cell permute 2 : performance optimization
1 parent 12c7f57 commit f827cf4

File tree

1 file changed

+12
-33
lines changed

1 file changed

+12
-33
lines changed

coreneuron/permute/cellorder.cpp

Lines changed: 12 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -478,26 +478,23 @@ static void bksub_interleaved(NrnThread* nt,
478478
}
479479

480480
// icore ranges [0:warpsize) ; stride[ncycle]
481+
nrn_pragma_acc(routine vector)
481482
static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) {
482483
int icycle = ncycle - 1;
483484
int istride = stride[icycle];
484485
int i = lastnode - istride + icore;
485-
//#ifndef CORENEURON_ENABLE_GPU
486486
int ii = i;
487-
//#endif
488487

489488
// execute until all tree depths are executed
490489
bool has_subtrees_to_compute = true;
491490

492491
// clang-format off
493492
nrn_pragma_acc(loop seq)
494493
for (; has_subtrees_to_compute; ) { // ncycle loop
495-
//#ifndef CORENEURON_ENABLE_GPU
496494
// serial test, gpu does this in parallel
497-
nrn_pragma_acc(loop)
495+
nrn_pragma_acc(loop vector)
498496
for (int icore = 0; icore < warpsize; ++icore) {
499497
int i = ii + icore;
500-
//#endif
501498
if (icore < istride) { // most efficient if istride equal warpsize
502499
// what is the index
503500
int ip = GPU_PARENT(i);
@@ -509,9 +506,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
509506
nrn_pragma_omp(atomic update)
510507
GPU_RHS(ip) -= p * GPU_RHS(i);
511508
}
512-
//#ifndef CORENEURON_ENABLE_GPU
513509
}
514-
//#endif
515510
// if finished with all tree depths then ready to break
516511
// (note that break is not allowed in OpenACC)
517512
if (icycle == 0) {
@@ -521,52 +516,41 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
521516
--icycle;
522517
istride = stride[icycle];
523518
i -= istride;
524-
//#ifndef CORENEURON_ENABLE_GPU
525519
ii -= istride;
526-
//#endif
527520
}
528-
// clang-format on
529521
}
530522

531523
// icore ranges [0:warpsize) ; stride[ncycle]
524+
nrn_pragma_acc(routine vector)
532525
static void bksub_interleaved2(NrnThread* nt,
533526
int root,
534527
int lastroot,
535528
int icore,
536529
int ncycle,
537530
int* stride,
538531
int firstnode) {
539-
//#ifndef CORENEURON_ENABLE_GPU
532+
nrn_pragma_acc(loop seq)
540533
for (int i = root; i < lastroot; i += 1) {
541-
//#else
542-
// nrn_pragma_acc(loop seq)
543-
// for (int i = root; i < lastroot; i += warpsize) {
544-
//#endif
545534
GPU_RHS(i) /= GPU_D(i); // the root
546535
}
547536

548537
int i = firstnode + icore;
549-
//#ifndef CORENEURON_ENABLE_GPU
550538
int ii = i;
551-
//#endif
539+
nrn_pragma_acc(loop seq)
552540
for (int icycle = 0; icycle < ncycle; ++icycle) {
553541
int istride = stride[icycle];
554-
//#ifndef CORENEURON_ENABLE_GPU
555-
nrn_pragma_acc(loop)
556542
// serial test, gpu does this in parallel
543+
nrn_pragma_acc(loop vector)
557544
for (int icore = 0; icore < warpsize; ++icore) {
558545
int i = ii + icore;
559-
//#endif
560546
if (icore < istride) {
561547
int ip = GPU_PARENT(i);
562548
GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
563549
GPU_RHS(i) /= GPU_D(i);
564550
}
565551
i += istride;
566-
//#ifndef CORENEURON_ENABLE_GPU
567552
}
568553
ii += istride;
569-
//#endif
570554
}
571555
}
572556

@@ -602,15 +586,14 @@ void solve_interleaved2(int ith) {
602586
defined(_OPENACC)
603587
int nstride = stridedispl[nwarp];
604588
#endif
605-
nrn_pragma_acc(parallel loop gang vector vector_length(
606-
warpsize) present(nt [0:1],
589+
nrn_pragma_acc(parallel loop gang present(nt [0:1],
607590
strides [0:nstride],
608591
ncycles [0:nwarp],
609592
stridedispl [0:nwarp + 1],
610593
rootbegin [0:nwarp + 1],
611594
nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
612595
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
613-
for (int icore = 0; icore < ncore; ++icore) {
596+
for (int icore = 0; icore < ncore; icore += warpsize) {
614597
int iwarp = icore / warpsize; // figure out the >> value
615598
int ic = icore & (warpsize - 1); // figure out the & mask
616599
int ncycle = ncycles[iwarp];
@@ -619,14 +602,10 @@ void solve_interleaved2(int ith) {
619602
int lastroot = rootbegin[iwarp + 1];
620603
int firstnode = nodebegin[iwarp];
621604
int lastnode = nodebegin[iwarp + 1];
622-
//#ifndef CORENEURON_ENABLE_GPU
623-
if (ic == 0) { // serial test mode. triang and bksub do all cores in warp
624-
//#endif
625-
triang_interleaved2(nt, ic, ncycle, stride, lastnode);
626-
bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
627-
//#ifndef CORENEURON_ENABLE_GPU
628-
} // serial test mode
629-
//#endif
605+
606+
// triang and bksub do all cores in warp
607+
triang_interleaved2(nt, ic, ncycle, stride, lastnode);
608+
bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
630609
}
631610
nrn_pragma_acc(wait(nt->stream_id))
632611
#ifdef _OPENACC

0 commit comments

Comments
 (0)