Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ if (PERFSUITE_ENABLE_WARNINGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This flag should go in the scripts/lc-builds/XXX files. Also, I think there is an architecture-agnostic flag (at least for GNU and Clang), such as -march=native: if the machine has SSE, AVX, AVX2, or AVX512, it will pick the best one.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I saw the lc-builds file and this is where I have it now. Not sure how this slipped in... but thanks for the feedback!

set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests")
set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples")
set(ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises")
Expand Down
1 change: 1 addition & 0 deletions scripts/lc-builds/toss3_icpc19.1.0.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ module load cmake/3.14.5
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-19.1.0/bin/icpc \
-DCMAKE_CXX_FLAGS="-xCORE-AVX2" \
-C ${RAJA_HOSTCONFIG} \
-DENABLE_OPENMP=On \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
Expand Down
19 changes: 19 additions & 0 deletions src/basic/DAXPY-Seq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,25 @@ void DAXPY::runSeqVariant(VariantID vid)

break;
}

case RAJA_Vec : {

startTimer();

for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

RAJA::forall<RAJA::vector_exec<vector_t>>(RAJA::TypedRangeSegment<I>(ibegin, iend),
[=](VecI i)
{
Y[i] += a * X[i];
});

}
stopTimer();

break;
}

#endif

default : {
Expand Down
8 changes: 7 additions & 1 deletion src/basic/DAXPY.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
#define DAXPY_DATA_SETUP \
Real_ptr x = m_x; \
Real_ptr y = m_y; \
Real_type a = m_a;
Real_type a = m_a; \
RAJA_INDEX_VALUE_T(I, int, "I");\
using vector_t = RAJA::StreamVector<Real_type, 2>;\
using VecI = RAJA::VectorIndex<I, vector_t>;\
RAJA::TypedView<Real_type, RAJA::Layout<1>, I> X(x, getRunSize()); \
RAJA::TypedView<Real_type, RAJA::Layout<1>, I> Y(y, getRunSize()); \


#define DAXPY_BODY \
y[i] += a * x[i] ;
Expand Down
1 change: 1 addition & 0 deletions src/common/KernelBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ void KernelBase::runKernel(VariantID vid)
#if defined(RUN_RAJA_SEQ)
case Lambda_Seq :
case RAJA_Seq :
case RAJA_Vec:
#endif
{
runSeqVariant(vid);
Expand Down
1 change: 1 addition & 0 deletions src/common/RAJAPerfSuite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ static const std::string VariantNames [] =
#if defined(RUN_RAJA_SEQ)
std::string("Lambda_Seq"),
std::string("RAJA_Seq"),
std::string("RAJA_Vec"),
#endif

#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
Expand Down
1 change: 1 addition & 0 deletions src/common/RAJAPerfSuite.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ enum VariantID {
#if defined(RUN_RAJA_SEQ)
Lambda_Seq,
RAJA_Seq,
RAJA_Vec,
#endif

#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
Expand Down
37 changes: 18 additions & 19 deletions src/polybench/POLYBENCH_2MM-OMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,26 +53,25 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid)

POLYBENCH_2MM_VIEWS_RAJA;

auto poly_2mm_lam1 = [=](Index_type /*i*/, Index_type /*j*/, Index_type /*k*/, Real_type &dot) {
auto poly_2mm_lam1 = [=](Real_type &dot) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vsrana01 have you checked whether these changes adversely affect performance across a range of compilers?

I don't think we want to make changes like this in this PR, since they are orthogonal. We can think about adding additional variants like this later to further stress compilers.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhornung67 These changes may be needed with the latest Lambda changes: if not all segments are in active loops, the Lambda form will static_assert out

POLYBENCH_2MM_BODY1_RAJA;
};
auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k,
Real_type &dot) {
POLYBENCH_2MM_BODY2_RAJA;
};
auto poly_2mm_lam3 = [=](Index_type i, Index_type j, Index_type /*k*/,
auto poly_2mm_lam3 = [=](Index_type i, Index_type j,
Real_type &dot) {
POLYBENCH_2MM_BODY3_RAJA;
};
auto poly_2mm_lam4 = [=](Index_type /*i*/, Index_type /*l*/, Index_type /*j*/,
Real_type &dot) {
auto poly_2mm_lam4 = [=](Real_type &dot) {
POLYBENCH_2MM_BODY4_RAJA;
};
auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j,
Real_type &dot) {
POLYBENCH_2MM_BODY5_RAJA;
};
auto poly_2mm_lam6 = [=](Index_type i, Index_type l, Index_type /*j*/,
auto poly_2mm_lam6 = [=](Index_type i, Index_type l,
Real_type &dot) {
POLYBENCH_2MM_BODY6_RAJA;
};
Expand Down Expand Up @@ -168,23 +167,23 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid)
RAJA::KernelPolicy<
RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
RAJA::ArgList<0, 1>,
RAJA::statement::Lambda<0>,
RAJA::statement::Lambda<0, RAJA::Params<0>>,
RAJA::statement::For<2, RAJA::loop_exec,
RAJA::statement::Lambda<1>
RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
>,
RAJA::statement::Lambda<2>
RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
>
>;
#else // without collapse...
using EXEC_POL =
RAJA::KernelPolicy<
RAJA::statement::For<0, RAJA::omp_parallel_for_exec,
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::Lambda<0>,
RAJA::statement::Lambda<0, RAJA::Params<0>>,
RAJA::statement::For<2, RAJA::loop_exec,
RAJA::statement::Lambda<1>
RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
>,
RAJA::statement::Lambda<2>
RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
>
>
>;
Expand All @@ -194,21 +193,21 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid)
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, ni},
RAJA::RangeSegment{0, nj},
RAJA::RangeSegment{0, nk}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, ni),
RAJA::RangeSegment(0, nj),
RAJA::RangeSegment(0, nk)),
RAJA::tuple<Real_type> {0.0},

poly_2mm_lam1,
poly_2mm_lam2,
poly_2mm_lam3
);

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, ni},
RAJA::RangeSegment{0, nl},
RAJA::RangeSegment{0, nj}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, ni),
RAJA::RangeSegment(0, nl),
RAJA::RangeSegment(0, nj)),
RAJA::tuple<Real_type> {0.0},

poly_2mm_lam4,
poly_2mm_lam5,
Expand Down
47 changes: 22 additions & 25 deletions src/polybench/POLYBENCH_2MM-Seq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,27 +43,24 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid)

POLYBENCH_2MM_VIEWS_RAJA;

auto poly_2mm_lam1 = [=](Index_type /*i*/, Index_type /*j*/, Index_type /*k*/, Real_type &dot) {
auto poly_2mm_lam1 = [=](Real_type &dot) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as previous one.

POLYBENCH_2MM_BODY1_RAJA;
};
auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k,
Real_type &dot) {
POLYBENCH_2MM_BODY2_RAJA;
};
auto poly_2mm_lam3 = [=](Index_type i, Index_type j, Index_type /*k*/,
Real_type &dot) {
auto poly_2mm_lam3 = [=](Index_type i, Index_type j, Real_type &dot) {
POLYBENCH_2MM_BODY3_RAJA;
};
auto poly_2mm_lam4 = [=](Index_type /*i*/, Index_type /*l*/, Index_type /*j*/,
Real_type &dot) {
auto poly_2mm_lam4 = [=](Real_type &dot) {
POLYBENCH_2MM_BODY4_RAJA;
};
auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j,
Real_type &dot) {
POLYBENCH_2MM_BODY5_RAJA;
};
auto poly_2mm_lam6 = [=](Index_type i, Index_type l, Index_type /*j*/,
Real_type &dot) {
auto poly_2mm_lam6 = [=](Index_type i, Index_type l, Real_type &dot) {
POLYBENCH_2MM_BODY6_RAJA;
};

Expand Down Expand Up @@ -137,36 +134,36 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid)

using EXEC_POL =
RAJA::KernelPolicy<
RAJA::statement::For<0, RAJA::loop_exec,
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::Lambda<0>,
RAJA::statement::For<2, RAJA::loop_exec,
RAJA::statement::Lambda<1>
>,
RAJA::statement::Lambda<2>
RAJA::statement::For<0, RAJA::loop_exec,
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::Lambda<0, RAJA::Params<0>>,
RAJA::statement::For<2, RAJA::loop_exec,
RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>,
RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
>
>
>
>;

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, ni},
RAJA::RangeSegment{0, nj},
RAJA::RangeSegment{0, nk}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, ni),
RAJA::RangeSegment(0, nj),
RAJA::RangeSegment(0, nk)),
RAJA::tuple<Real_type> {0.0},

poly_2mm_lam1,
poly_2mm_lam2,
poly_2mm_lam1,
poly_2mm_lam2,
poly_2mm_lam3
);
);

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, ni},
RAJA::RangeSegment{0, nl},
RAJA::RangeSegment{0, nj}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, ni),
RAJA::RangeSegment(0, nl),
RAJA::RangeSegment(0, nj)),
RAJA::tuple<Real_type> {0.0},

poly_2mm_lam4,
poly_2mm_lam5,
Expand Down
52 changes: 24 additions & 28 deletions src/polybench/POLYBENCH_3MM-OMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,41 +62,37 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid)

POLYBENCH_3MM_VIEWS_RAJA;

auto poly_3mm_lam1 = [=] (Index_type /*i*/, Index_type /*j*/, Index_type /*k*/,
Real_type &dot) {
auto poly_3mm_lam1 = [=] ( Real_type &dot) {
POLYBENCH_3MM_BODY1_RAJA;
};
auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k,
Real_type &dot) {
POLYBENCH_3MM_BODY2_RAJA;
};
auto poly_3mm_lam3 = [=] (Index_type i, Index_type j, Index_type /*k*/,
auto poly_3mm_lam3 = [=] (Index_type i, Index_type j,
Real_type &dot) {
POLYBENCH_3MM_BODY3_RAJA;
};

auto poly_3mm_lam4 = [=] (Index_type /*j*/, Index_type /*l*/, Index_type /*m*/,
Real_type &dot) {
auto poly_3mm_lam4 = [=] (Real_type &dot) {
POLYBENCH_3MM_BODY4_RAJA;
};
auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m,
Real_type &dot) {
POLYBENCH_3MM_BODY5_RAJA;
};
auto poly_3mm_lam6 = [=] (Index_type j, Index_type l, Index_type /*m*/,
auto poly_3mm_lam6 = [=] (Index_type j, Index_type l,
Real_type &dot) {
POLYBENCH_3MM_BODY6_RAJA;
};
auto poly_3mm_lam7 = [=] (Index_type /*i*/, Index_type /*l*/, Index_type /*j*/,
Real_type &dot) {
auto poly_3mm_lam7 = [=] (Real_type &dot) {
POLYBENCH_3MM_BODY7_RAJA;
};
auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j,
Real_type &dot) {
POLYBENCH_3MM_BODY8_RAJA;
};
auto poly_3mm_lam9 = [=] (Index_type i, Index_type l, Index_type /*j*/,
Real_type &dot) {
auto poly_3mm_lam9 = [=] (Index_type i, Index_type l, Real_type &dot) {
POLYBENCH_3MM_BODY9_RAJA;
};

Expand Down Expand Up @@ -221,23 +217,23 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid)
RAJA::KernelPolicy<
RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
RAJA::ArgList<0, 1>,
RAJA::statement::Lambda<0>,
RAJA::statement::Lambda<0, RAJA::Params<0>>,
RAJA::statement::For<2, RAJA::loop_exec,
RAJA::statement::Lambda<1>
RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
>,
RAJA::statement::Lambda<2>
RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
>
>;
#else
using EXEC_POL =
RAJA::KernelPolicy<
RAJA::statement::For<0, RAJA::omp_parallel_for_exec,
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::Lambda<0>,
RAJA::statement::Lambda<0, RAJA::Params<0>>,
RAJA::statement::For<2, RAJA::loop_exec,
RAJA::statement::Lambda<1>
RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
>,
RAJA::statement::Lambda<2>
RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
>
>
>;
Expand All @@ -247,10 +243,10 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid)
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, ni},
RAJA::RangeSegment{0, nj},
RAJA::RangeSegment{0, nk}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, ni),
RAJA::RangeSegment(0, nj),
RAJA::RangeSegment(0, nk)),
RAJA::tuple<Real_type> {0.0},

poly_3mm_lam1,
poly_3mm_lam2,
Expand All @@ -259,10 +255,10 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid)
);

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, nj},
RAJA::RangeSegment{0, nl},
RAJA::RangeSegment{0, nm}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, nj),
RAJA::RangeSegment(0, nl),
RAJA::RangeSegment(0, nm)),
RAJA::tuple<Real_type> {0.0},

poly_3mm_lam4,
poly_3mm_lam5,
Expand All @@ -271,10 +267,10 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid)
);

RAJA::kernel_param<EXEC_POL>(
RAJA::make_tuple(RAJA::RangeSegment{0, ni},
RAJA::RangeSegment{0, nl},
RAJA::RangeSegment{0, nj}),
RAJA::make_tuple(static_cast<Real_type>(0.0)),
RAJA::make_tuple(RAJA::RangeSegment(0, ni),
RAJA::RangeSegment(0, nl),
RAJA::RangeSegment(0, nj)),
RAJA::tuple<Real_type>{0.0},

poly_3mm_lam7,
poly_3mm_lam8,
Expand Down
Loading