From 41bcd8e4d38c7aba0311a42760edf56e90deb268 Mon Sep 17 00:00:00 2001 From: Lukas Dreyer Date: Tue, 21 Oct 2025 12:27:26 +0200 Subject: [PATCH 1/3] add pragma unrolls --- .../t8_standalone_implementation.hxx | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx b/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx index af81a60b4a..81d093ce22 100644 --- a/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx +++ b/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx @@ -376,6 +376,7 @@ struct t8_standalone_scheme const t8_standalone_element *el2 = (const t8_standalone_element *) elem2; if (el1->level != el2->level) return 0; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { if (el1->coords[idim] != el2->coords[idim]) return 0; @@ -411,6 +412,7 @@ struct t8_standalone_scheme { t8_standalone_element *el = (t8_standalone_element *) elem; el->level = 0; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { el->coords[idim] = 0; } @@ -652,6 +654,7 @@ struct t8_standalone_scheme T8_ASSERT (0 <= el->level && el->level <= T8_ELEMENT_MAXLEVEL[TEclass]); int cube_id = 0; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { cube_id |= (el->coords[idim] & length) ? (1 << idim) : 0; } @@ -772,6 +775,7 @@ struct t8_standalone_scheme /* Shift the coords to the eighth cube. The type of the last descendant * is the type of the input element */ t8_element_coord coord_offset = element_get_len (el->level) - element_get_len (level); +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { d->coords[idim] |= coord_offset; } @@ -981,6 +985,7 @@ struct t8_standalone_scheme last_descendant->type = el->type; /**TODO: Check if this is always true! */ } +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { int multiplier = 1; if constexpr (!T8_ELEMENT_NUM_EQUATIONS[TEclass]) { @@ -1019,6 +1024,7 @@ struct t8_standalone_scheme } // all edges containing dim must be fulfilled with x_d-a_d >= x_j-a_j or x_j-a_j <= x_d-a_d if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#pragma GCC unroll 4 for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ++ieq) { if ((t8_type_edge_equations[ieq][0] == dim && get_typebit (el->type, ieq)) || (t8_type_edge_equations[ieq][1] == dim && !get_typebit (el->type, ieq))) { @@ -1035,6 +1041,7 @@ struct t8_standalone_scheme } // all edges containing dimid must be fulfilled with x_d-a_d <= x_j-a_j or x_j-a_j >= x_d-a_d if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#pragma GCC unroll 4 for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { if ((t8_type_edge_equations[ieq][0] == dim && !get_typebit (el->type, ieq)) || (t8_type_edge_equations[ieq][1] == dim && get_typebit (el->type, ieq))) { @@ -1146,6 +1153,7 @@ struct t8_standalone_scheme /**Adapt typebits*/ if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#pragma GCC unroll 4 for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { /**For all neighboring typebits, change typebit*/ if (t8_type_edge_equations[ieq][0] == facenormal_dim @@ -1517,6 +1525,7 @@ struct t8_standalone_scheme int coords_int[T8_ELEMENT_DIM[TEclass]]; T8_ASSERT (0 <= vertex && vertex < T8_ELEMENT_NUM_CORNERS[TEclass]); element_compute_coords (el, vertex, coords_int); +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { coords[idim] = coords_int[idim] / (double) get_root_len (); } @@ -1542,7 +1551,9 @@ struct t8_standalone_scheme if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { double tmp_coords[T8_ELEMENT_DIM[TEclass]] = { 0.0 }; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; ++idim) { +#pragma GCC unroll 4 for (int jdim = 0; jdim < T8_ELEMENT_DIM[TEclass]; ++jdim) { tmp_coords[idim] += t8_standalone_lut_transform_coords[el->type][idim][jdim] * current_ref_coords[jdim]; @@ -1554,17 +1565,21 @@ struct t8_standalone_scheme for (size_t coord = 0; coord < num_coords; ++coord) { double tmp_out_coords[T8_ELEMENT_DIM[TEclass]] = {}; +#pragma GCC unroll 4 for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { current_out_coords[dim] = el->coords[dim] + tmp_coords[dim] * length; current_out_coords[dim] /= (double) get_root_len (); } +#pragma GCC unroll 4 for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { +#pragma GCC unroll 4 for (int jdim = 0; jdim < T8_ELEMENT_DIM[TEclass]; ++jdim) { tmp_out_coords[dim] += t8_standalone_lut_backtransform_coords[0][dim][jdim] * current_out_coords[jdim]; } } +#pragma GCC unroll 4 for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { current_out_coords[dim] = tmp_out_coords[dim]; } @@ -1576,6 +1591,7 @@ struct t8_standalone_scheme else { for (size_t coord = 0; coord < num_coords; ++coord) { +#pragma GCC unroll 4 for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { current_out_coords[dim] = el->coords[dim] + current_ref_coords[dim] * length; @@ -1784,6 +1800,7 @@ struct t8_standalone_scheme t8_standalone_element **els = (t8_standalone_element **) elements; for (unsigned int ielem = 0; ielem < count; ielem++) { +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { mpiret = sc_MPI_Pack (&(els[ielem]->coords[idim]), 1, sc_MPI_INT, send_buffer, buffer_size, position, comm); SC_CHECK_MPI (mpiret); @@ -1838,6 +1855,7 @@ struct t8_standalone_scheme t8_standalone_element **els = (t8_standalone_element **) elements; for (unsigned int ielem = 0; ielem < count; ielem++) { +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { mpiret = sc_MPI_Unpack (recvbuf, buffer_size, position, &(els[ielem]->coords[idim]), 1, sc_MPI_INT, comm); SC_CHECK_MPI (mpiret); @@ -1878,6 +1896,7 @@ struct t8_standalone_scheme /* The cube id of the root element is 0.*/ if (level != 0) { +#pragma GCC unroll 4 for (int i = 0; i < T8_ELEMENT_DIM[TEclass]; i++) { cube_id |= ((elem->coords[i] & h) ? 1 << i : 0); } @@ -1930,6 +1949,7 @@ struct t8_standalone_scheme Therefore this is the level needed so that all coordinates equal.*/ t8_element_coord maxexclor = 0; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { maxexclor |= (elem1->coords[idim] ^ elem2->coords[idim]); } @@ -1962,6 +1982,7 @@ struct t8_standalone_scheme element_cut_coordinates (t8_standalone_element *elem, const int shift) noexcept { T8_ASSERT (0 <= shift && shift <= T8_ELEMENT_MAXLEVEL[TEclass]); +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { elem->coords[idim] = (elem->coords[idim] >> shift) << shift; } @@ -1979,6 +2000,7 @@ struct t8_standalone_scheme set_coords_at_level_to_zero (const t8_standalone_element *elem, t8_standalone_element *parent_elem, const t8_element_coord length) noexcept { +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { parent_elem->coords[idim] = elem->coords[idim] & ~length; } @@ -1997,6 +2019,7 @@ struct t8_standalone_scheme put_cube_id_at_level (const t8_standalone_element *parent, t8_standalone_element *child, const t8_element_coord length, const t8_cube_id cube_id) noexcept { +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { child->coords[idim] = parent->coords[idim] + ((cube_id & (1 << idim)) ? length : 0); } @@ -2055,15 +2078,18 @@ struct t8_standalone_scheme t8_standalone_element *el = (t8_standalone_element *) elem; const int8_t type = el->type; int tmp_out_coords[T8_ELEMENT_DIM[TEclass]] = {}; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { coords[idim] = el->coords[idim] + t8_type_vertex_dim_to_binary[type][vertex][idim] * element_get_len (el->level); } +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; ++idim) { for (int jdim = 0; jdim < T8_ELEMENT_DIM[TEclass]; ++jdim) { tmp_out_coords[idim] += t8_standalone_lut_backtransform_coords[0][idim][jdim] * coords[jdim]; } } +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; ++idim) { coords[idim] = tmp_out_coords[idim]; } @@ -2071,6 +2097,7 @@ struct t8_standalone_scheme } else { //Hypercubes +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { coords[idim] = elem->coords[idim] + ((vertex & (1 << idim)) >> idim) * element_get_len (elem->level); } @@ -2221,6 +2248,7 @@ struct t8_standalone_scheme boundary->type = 0; boundary->level = el->level; /* Delete the coordinate orthogonal to the given face and combine the remaining coordinates*/ +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { const int ifacedim = get_facedim (idim, root_face); @@ -2240,6 +2268,7 @@ struct t8_standalone_scheme } } if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#pragma GCC unroll 4 for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { const int ifaceeq = t8_standalone_lut_rootface_eq_to_faceeq[root_face][ieq]; if (ifaceeq != -1) { @@ -2279,6 +2308,7 @@ struct t8_standalone_scheme else { el->level = face->level; +#pragma GCC unroll 4 for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { const int ifacedim = get_facedim (idim, root_face); @@ -2313,6 +2343,7 @@ struct t8_standalone_scheme else { u_int8_t root_type = 0; el->type = root_type; +#pragma GCC unroll 4 for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { const int ifaceeq = t8_standalone_lut_rootface_eq_to_faceeq[root_face][ieq]; if (ifaceeq != -1) { @@ -2320,6 +2351,7 @@ struct t8_standalone_scheme } } /** Set those typebits, that are connected to the face_normaldim of root_face*/ +#pragma GCC unroll 4 for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { const int facenormal_dim = t8_standalone_lut_type_face_to_facenormal_dim[root_type][root_face]; if (t8_type_edge_equations[ieq][0] == facenormal_dim) { @@ -2385,6 +2417,7 @@ struct t8_standalone_scheme u_int8_t type = 0; T8_ASSERT (0 <= el->level && el->level <= T8_ELEMENT_MAXLEVEL[TEclass]); +#pragma GCC unroll 4 for (int e = 0; e < T8_ELEMENT_NUM_EQUATIONS[TEclass]; e++) { t8_element_coord coord_v0 = el->coords[t8_type_edge_equations[e][0]]; t8_element_coord coord_v1 = el->coords[t8_type_edge_equations[e][1]]; From 92191a0e9d7b9e47c975a5a50cc252257bd4f0c3 Mon Sep 17 00:00:00 2001 From: Lukas Dreyer Date: Tue, 21 Oct 2025 16:34:05 +0200 Subject: [PATCH 2/3] add loop unroll cmake option --- CMakeLists.txt | 1 + src/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 84bed9ad77..acf5d22179 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option( T8CODE_ENABLE_MPI "Enable t8code's features which rely on MPI" ON ) option( T8CODE_ENABLE_VTK "Enable t8code's features which rely on VTK" OFF ) option( T8CODE_ENABLE_OCC "Enable t8code's features which rely on OpenCASCADE" OFF ) option( T8CODE_ENABLE_NETCDF "Enable t8code's features which rely on netCDF" OFF ) +option( T8CODE_ENABLE_STANDALONE_LOOP_UNROLL "Enables t8code's loop unroll in standalone scheme" OFF ) option( T8CODE_USE_SYSTEM_SC "Use system-installed sc library" OFF ) option( T8CODE_USE_SYSTEM_P4EST "Use system-installed p4est library" OFF ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4d4e90ce2..32e460bc20 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -51,6 +51,11 @@ if( T8CODE_EXPORT_COMPILE_COMMANDS ) set_target_properties( T8 PROPERTIES EXPORT_COMPILE_COMMANDS ON ) endif( T8CODE_EXPORT_COMPILE_COMMANDS ) +if( T8CODE_ENABLE_STANDALONE_LOOP_UNROLL ) + target_compile_definitions(T8 PUBLIC T8_ENABLE_STANDALONE_LOOP_UNROLL=1 ) +endif() + + if( T8CODE_ENABLE_NETCDF ) target_link_libraries( T8 PUBLIC NetCDF::NetCDF ) target_compile_definitions(T8 PUBLIC From 89f799be58eaa2f07682a36dd5c468ea8bd72ed1 Mon Sep 17 00:00:00 2001 From: Lukas Dreyer Date: Tue, 21 Oct 2025 16:35:17 +0200 Subject: [PATCH 3/3] add flag around unroll option --- .../t8_standalone_implementation.hxx | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx b/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx index 81d093ce22..fb0d968def 100644 --- a/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx +++ b/src/t8_schemes/t8_standalone/t8_standalone_implementation.hxx @@ -376,7 +376,9 @@ struct t8_standalone_scheme const t8_standalone_element *el2 = (const t8_standalone_element *) elem2; if (el1->level != el2->level) return 0; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { if (el1->coords[idim] != el2->coords[idim]) return 0; @@ -412,7 +414,9 @@ struct t8_standalone_scheme { t8_standalone_element *el = (t8_standalone_element *) elem; el->level = 0; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { el->coords[idim] = 0; } @@ -654,7 +658,9 @@ struct t8_standalone_scheme T8_ASSERT (0 <= el->level && el->level <= T8_ELEMENT_MAXLEVEL[TEclass]); int cube_id = 0; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { cube_id |= (el->coords[idim] & length) ? (1 << idim) : 0; } @@ -775,7 +781,9 @@ struct t8_standalone_scheme /* Shift the coords to the eighth cube. The type of the last descendant * is the type of the input element */ t8_element_coord coord_offset = element_get_len (el->level) - element_get_len (level); +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { d->coords[idim] |= coord_offset; } @@ -985,7 +993,9 @@ struct t8_standalone_scheme last_descendant->type = el->type; /**TODO: Check if this is always true! */ } +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { int multiplier = 1; if constexpr (!T8_ELEMENT_NUM_EQUATIONS[TEclass]) { @@ -1024,7 +1034,9 @@ struct t8_standalone_scheme } // all edges containing dim must be fulfilled with x_d-a_d >= x_j-a_j or x_j-a_j <= x_d-a_d if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ++ieq) { if ((t8_type_edge_equations[ieq][0] == dim && get_typebit (el->type, ieq)) || (t8_type_edge_equations[ieq][1] == dim && !get_typebit (el->type, ieq))) { @@ -1041,7 +1053,9 @@ struct t8_standalone_scheme } // all edges containing dimid must be fulfilled with x_d-a_d <= x_j-a_j or x_j-a_j >= x_d-a_d if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { if ((t8_type_edge_equations[ieq][0] == dim && !get_typebit (el->type, ieq)) || (t8_type_edge_equations[ieq][1] == dim && get_typebit (el->type, ieq))) { @@ -1153,7 +1167,9 @@ struct t8_standalone_scheme /**Adapt typebits*/ if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { /**For all neighboring typebits, change typebit*/ if (t8_type_edge_equations[ieq][0] == facenormal_dim @@ -1525,7 +1541,9 @@ struct t8_standalone_scheme int coords_int[T8_ELEMENT_DIM[TEclass]]; T8_ASSERT (0 <= vertex && vertex < T8_ELEMENT_NUM_CORNERS[TEclass]); element_compute_coords (el, vertex, coords_int); +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { coords[idim] = coords_int[idim] / (double) get_root_len (); } @@ -1551,9 +1569,13 @@ struct t8_standalone_scheme if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { double tmp_coords[T8_ELEMENT_DIM[TEclass]] = { 0.0 }; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; ++idim) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int jdim = 0; jdim < T8_ELEMENT_DIM[TEclass]; ++jdim) { tmp_coords[idim] += t8_standalone_lut_transform_coords[el->type][idim][jdim] * current_ref_coords[jdim]; @@ -1565,21 +1587,29 @@ struct t8_standalone_scheme for (size_t coord = 0; coord < num_coords; ++coord) { double tmp_out_coords[T8_ELEMENT_DIM[TEclass]] = {}; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { current_out_coords[dim] = el->coords[dim] + tmp_coords[dim] * length; current_out_coords[dim] /= (double) get_root_len (); } +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int jdim = 0; jdim < T8_ELEMENT_DIM[TEclass]; ++jdim) { tmp_out_coords[dim] += t8_standalone_lut_backtransform_coords[0][dim][jdim] * current_out_coords[jdim]; } } +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { current_out_coords[dim] = tmp_out_coords[dim]; } @@ -1591,7 +1621,9 @@ struct t8_standalone_scheme else { for (size_t coord = 0; coord < num_coords; ++coord) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int dim = 0; dim < T8_ELEMENT_DIM[TEclass]; ++dim) { current_out_coords[dim] = el->coords[dim] + current_ref_coords[dim] * length; @@ -1800,7 +1832,9 @@ struct t8_standalone_scheme t8_standalone_element **els = (t8_standalone_element **) elements; for (unsigned int ielem = 0; ielem < count; ielem++) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { mpiret = sc_MPI_Pack (&(els[ielem]->coords[idim]), 1, sc_MPI_INT, send_buffer, buffer_size, position, comm); SC_CHECK_MPI (mpiret); @@ -1855,7 +1889,9 @@ struct t8_standalone_scheme t8_standalone_element **els = (t8_standalone_element **) elements; for (unsigned int ielem = 0; ielem < count; ielem++) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { mpiret = sc_MPI_Unpack (recvbuf, buffer_size, position, &(els[ielem]->coords[idim]), 1, sc_MPI_INT, comm); SC_CHECK_MPI (mpiret); @@ -1896,7 +1932,9 @@ struct t8_standalone_scheme /* The cube id of the root element is 0.*/ if (level != 0) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int i = 0; i < T8_ELEMENT_DIM[TEclass]; i++) { cube_id |= ((elem->coords[i] & h) ? 1 << i : 0); } @@ -1949,7 +1987,9 @@ struct t8_standalone_scheme Therefore this is the level needed so that all coordinates equal.*/ t8_element_coord maxexclor = 0; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { maxexclor |= (elem1->coords[idim] ^ elem2->coords[idim]); } @@ -1982,7 +2022,9 @@ struct t8_standalone_scheme element_cut_coordinates (t8_standalone_element *elem, const int shift) noexcept { T8_ASSERT (0 <= shift && shift <= T8_ELEMENT_MAXLEVEL[TEclass]); +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { elem->coords[idim] = (elem->coords[idim] >> shift) << shift; } @@ -2000,7 +2042,9 @@ struct t8_standalone_scheme set_coords_at_level_to_zero (const t8_standalone_element *elem, t8_standalone_element *parent_elem, const t8_element_coord length) noexcept { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { parent_elem->coords[idim] = elem->coords[idim] & ~length; } @@ -2019,7 +2063,9 @@ struct t8_standalone_scheme put_cube_id_at_level (const t8_standalone_element *parent, t8_standalone_element *child, const t8_element_coord length, const t8_cube_id cube_id) noexcept { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { child->coords[idim] = parent->coords[idim] + ((cube_id & (1 << idim)) ? length : 0); } @@ -2078,18 +2124,24 @@ struct t8_standalone_scheme t8_standalone_element *el = (t8_standalone_element *) elem; const int8_t type = el->type; int tmp_out_coords[T8_ELEMENT_DIM[TEclass]] = {}; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { coords[idim] = el->coords[idim] + t8_type_vertex_dim_to_binary[type][vertex][idim] * element_get_len (el->level); } +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; ++idim) { for (int jdim = 0; jdim < T8_ELEMENT_DIM[TEclass]; ++jdim) { tmp_out_coords[idim] += t8_standalone_lut_backtransform_coords[0][idim][jdim] * coords[jdim]; } } +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; ++idim) { coords[idim] = tmp_out_coords[idim]; } @@ -2097,7 +2149,9 @@ struct t8_standalone_scheme } else { //Hypercubes +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { coords[idim] = elem->coords[idim] + ((vertex & (1 << idim)) >> idim) * element_get_len (elem->level); } @@ -2248,7 +2302,9 @@ struct t8_standalone_scheme boundary->type = 0; boundary->level = el->level; /* Delete the coordinate orthogonal to the given face and combine the remaining coordinates*/ +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { const int ifacedim = get_facedim (idim, root_face); @@ -2268,7 +2324,9 @@ struct t8_standalone_scheme } } if constexpr (T8_ELEMENT_NUM_EQUATIONS[TEclass]) { +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { const int ifaceeq = t8_standalone_lut_rootface_eq_to_faceeq[root_face][ieq]; if (ifaceeq != -1) { @@ -2308,7 +2366,9 @@ struct t8_standalone_scheme else { el->level = face->level; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int idim = 0; idim < T8_ELEMENT_DIM[TEclass]; idim++) { const int ifacedim = get_facedim (idim, root_face); @@ -2343,7 +2403,9 @@ struct t8_standalone_scheme else { u_int8_t root_type = 0; el->type = root_type; +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { const int ifaceeq = t8_standalone_lut_rootface_eq_to_faceeq[root_face][ieq]; if (ifaceeq != -1) { @@ -2351,7 +2413,9 @@ struct t8_standalone_scheme } } /** Set those typebits, that are connected to the face_normaldim of root_face*/ +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int ieq = 0; ieq < T8_ELEMENT_NUM_EQUATIONS[TEclass]; ieq++) { const int facenormal_dim = t8_standalone_lut_type_face_to_facenormal_dim[root_type][root_face]; if (t8_type_edge_equations[ieq][0] == facenormal_dim) { @@ -2417,7 +2481,9 @@ struct t8_standalone_scheme u_int8_t type = 0; T8_ASSERT (0 <= el->level && el->level <= T8_ELEMENT_MAXLEVEL[TEclass]); +#if T8_ENABLE_STANDALONE_LOOP_UNROLL #pragma GCC unroll 4 +#endif for (int e = 0; e < T8_ELEMENT_NUM_EQUATIONS[TEclass]; e++) { t8_element_coord coord_v0 = el->coords[t8_type_edge_equations[e][0]]; t8_element_coord coord_v1 = el->coords[t8_type_edge_equations[e][1]];