diff --git a/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf b/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf index d465856359..73b9f70566 100644 Binary files a/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf and b/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf differ diff --git a/ChangeLog b/ChangeLog index 580fff8dda..74cc008d39 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Mar 3, 2021, SuiteSparse 5.9.0 + + * GraphBLAS v4.0.3: many new features, much faster performance + July 14, 2020, SuiteSparse 5.8.1 * SLIP_LU v1.0.2: resolved issue #51 diff --git a/GraphBLAS/.gitignore b/GraphBLAS/.gitignore index 5a4ebfdb17..5b52c7158d 100644 --- a/GraphBLAS/.gitignore +++ b/GraphBLAS/.gitignore @@ -36,6 +36,7 @@ my_*.out .nfs* .pyc *.tmp +gunk* Demo/bfs_demo.out Demo/mis_demo.out @@ -60,6 +61,7 @@ alternative/*_demo Test/*.log Test/errlog.txt Test/errlog2.txt +Test/errlog3.txt Test/log.txt Test/gunk* Test/fprint.txt @@ -69,6 +71,7 @@ Doc/GraphBLAS_UserGuide.out Tcov/errlog.txt Tcov/errlog2.txt +Tcov/errlog3.txt Tcov/log.txt Tcov/grbstat.mat Tcov/fprint.txt diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt index f74d3efaa9..c5ea314a8c 100644 --- a/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/CMakeLists.txt @@ -2,8 +2,8 @@ # GraphBLAS/CMakeLists.txt: cmake script for GraphBLAS #------------------------------------------------------------------------------- -# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2020, All Rights Reserved. -# http://suitesparse.com See GraphBLAS/Doc/License.txt for license. +# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 # CMakeLists.txt: instructions for cmake to build GraphBLAS. # An ANSI C11 compiler is required. @@ -59,9 +59,9 @@ endif ( ) set ( CMAKE_MACOSX_RPATH TRUE ) # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "July 14, 2020" ) -set ( GraphBLAS_VERSION_MAJOR 3 ) -set ( GraphBLAS_VERSION_MINOR 3 ) +set ( GraphBLAS_DATE "Jan 19, 2021") +set ( GraphBLAS_VERSION_MAJOR 4 ) +set ( GraphBLAS_VERSION_MINOR 0 ) set ( GraphBLAS_VERSION_SUB 3 ) # GraphBLAS C API Specification version, at graphblas.org @@ -101,12 +101,14 @@ else ( ) message ( STATUS "Building dynamic GraphBLAS library only" ) endif ( ) -# select "true" to enable burble, for GraphBLAS development only -# set ( GB_BURBLE true ) - set ( GB_BURBLE false ) +# select "false" to disable the burble. It is now enabled by default. 
+ set ( GB_BURBLE true ) +# set ( GB_BURBLE false ) if ( GB_BURBLE ) set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_BURBLE=1 " ) +else ( ) + set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_BURBLE=0 " ) endif ( ) if ( GBCOMPACT ) @@ -159,7 +161,7 @@ if ( CMAKE_CUDA ) message ( STATUS "CUDA enabled" ) set ( CMAKE_CUDA_FLAG " -DGBCUDA" ) set ( GB_CUDA graphblascuda cuda cudadevrt cudart nvrtc ) - link_directories ( "CUDA" "/usr/local/cuda/lib64" ) + link_directories ( "CUDA" "/usr/local/cuda/lib64" "/usr/local/cuda/lib64/stubs" ) else ( ) message ( STATUS "CUDA not enabled" ) set ( CMAKE_CUDA_FLAG " " ) @@ -180,8 +182,6 @@ endif ( ) message ( STATUS "CMAKE compiler ID: " ${CMAKE_C_COMPILER_ID} ) message ( STATUS "CMAKE thread library: " ${CMAKE_THREAD_LIBS_INIT} ) -message ( STATUS "CMAKE have pthreads: " ${CMAKE_USE_PTHREADS_INIT} ) -message ( STATUS "CMAKE have Win32 pthreads: " ${CMAKE_USE_WIN32_THREADS_INIT} ) message ( STATUS "CMAKE have OpenMP: " ${OPENMP_FOUND} ) #------------------------------------------------------------------------------- @@ -238,6 +238,9 @@ include_directories ( Source/Template Source Source/Generated Source/Generator I # check which compiler is being used. If you need to make # compiler-specific modifications, here is the place to do it. if ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + # The -g option is useful for the Intel VTune tool, but it should be + # removed in production. Comment this line out if not in use: + # set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g" ) # cmake 2.8 workaround: gcc needs to be told to do ANSI C11. # cmake 3.0 doesn't have this problem. set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -lm -Wno-pragmas " ) @@ -251,7 +254,6 @@ if ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fwrapv ") # check all warnings (uncomment for development only) # set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wpedantic -Werror " ) - # set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g" ) if ( CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9 ) message ( FATAL_ERROR "gcc version must be at least 4.9" ) endif ( ) @@ -274,6 +276,7 @@ elseif ( "${CMAKE_C_COMPILER_ID}" STREQUAL "Intel" ) endif ( ) elseif ( "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" ) # options for clang + set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-sign " ) if ( CMAKE_C_COMPILER_VERSION VERSION_LESS 3.3 ) message ( FATAL_ERROR "clang version must be at least 3.3" ) endif ( ) @@ -349,25 +352,10 @@ endif ( ) # select the threading library #------------------------------------------------------------------------------- -if ( USER_OPENMP ) - # user insists on OpenMP synchronization inside GraphBLAS - message ( STATUS "cmake -DUSER_OPENMP=1: insisting on using OpenMP" ) +if ( OPENMP_FOUND ) +# set this to 'false' if you do not want OpenMP +# set ( USE_OPENMP false ) set ( USE_OPENMP true ) -elseif ( USER_POSIX ) - # user insists on POSIX synchronization inside GraphBLAS - message ( STATUS "cmake -DUSER_POSIX=1: insisting on using POSIX" ) - set ( USE_POSIX true ) -elseif ( USER_NONE ) - message ( STATUS "cmake -DUSER_NONE=1: insisting on using no threading" ) - set ( USE_NONE true ) -else ( ) - # default: automatic selection - message ( STATUS "Automatic selection of synchronization method for user threads" ) - if ( OPENMP_FOUND ) - set ( USE_OPENMP true ) - elseif ( CMAKE_USE_PTHREADS_INIT ) - set ( USE_POSIX true ) - endif ( ) endif ( ) #------------------------------------------------------------------------------- @@ -380,67 +368,44 @@ else ( ) set ( M_LIB "m" ) endif ( ) 
+target_link_libraries ( graphblas ${M_LIB} ) +if ( BUILD_GRB_STATIC_LIBRARY ) + target_link_libraries ( graphblas_static ${M_LIB} ) +endif ( ) + #------------------------------------------------------------------------------- -# add the threading library +# add the OpenMP, CUDA, BLAS, ... libraries #------------------------------------------------------------------------------- if ( USE_OPENMP ) - # use OpenMP for user thread synchronization - message ( STATUS "Using OpenMP to synchronize user threads" ) - target_link_libraries ( graphblas ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) - if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) - endif ( ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -DUSER_OPENMP_THREADS " ) -elseif ( USE_POSIX ) - # use POSIX for user thread synchronization - message ( STATUS "Using POSIX pthreads to synchronize user threads" ) - target_link_libraries ( graphblas ${M_LIB} ${GB_CUDA} ) + target_link_libraries ( graphblas ${OpenMP_C_LIBRARIES} ) if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${GB_CUDA} ) + target_link_libraries ( graphblas_static ${OpenMP_C_LIBRARIES} ) endif ( ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -DUSER_POSIX_THREADS " ) -else ( ) - # use no threading at all - message ( WARNING "No support for user threads; GraphBLAS will not be thread-safe" ) - target_link_libraries ( graphblas ${M_LIB} ${GB_CUDA} ) - if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${GB_CUDA} ) - endif ( ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSER_NO_THREADS " ) + set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} " ) endif ( ) -if ( CMAKE_THREAD_LIBS_INIT ) - target_link_libraries ( graphblas ${CMAKE_THREAD_LIBS_INIT} ${GB_CUDA} ) +if ( CMAKE_CUDA ) + target_link_libraries ( graphblas ${GB_CUDA} ) if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${CMAKE_THREAD_LIBS_INIT} ${GB_CUDA} ) + target_link_libraries ( graphblas_static ${GB_CUDA} ) endif ( ) endif ( ) -if ( OPENMP_FOUND ) - # use OpenMP for internal parallelism - message ( STATUS "Using OpenMP for internal parallelism" ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" ) - target_link_libraries ( graphblas ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) +if ( CMAKE_THREAD_LIBS_INIT ) + target_link_libraries ( graphblas ${CMAKE_THREAD_LIBS_INIT} ) if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) + target_link_libraries ( graphblas_static ${CMAKE_THREAD_LIBS_INIT} ) endif ( ) endif ( ) -if ( CMAKE_USE_PTHREADS_INIT ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_PTHREADS " ) -endif ( ) - -if ( CMAKE_USE_WIN32_THREADS_INIT ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_WINDOWS_THREADS " ) -endif ( ) - -if ( BLAS_FOUND ) - # use the dense CBLAS - message ( STATUS "Using dense CBLAS for faster dense matrix/vector operations" ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_HAS_CBLAS " ) - target_link_libraries ( graphblas ${BLAS_LIBRARIES} ) -endif ( ) +# FUTURE: +# if ( BLAS_FOUND ) +# # use the dense CBLAS +# message ( STATUS "Using dense CBLAS for faster dense matrix/vector operations" ) +# set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_HAS_CBLAS " ) +# target_link_libraries ( graphblas ${BLAS_LIBRARIES} ) +# endif ( ) #------------------------------------------------------------------------------- # determine the default matrix format @@ -487,7 +452,6 
@@ endif ( ) add_executable ( pagerank_demo "Demo/Program/pagerank_demo.c" ) add_executable ( bfs_demo "Demo/Program/bfs_demo.c" ) add_executable ( tri_demo "Demo/Program/tri_demo.c" ) -add_executable ( pthread_demo "Demo/Program/pthread_demo.c" ) add_executable ( openmp_demo "Demo/Program/openmp_demo.c" ) add_executable ( mis_demo "Demo/Program/mis_demo.c" ) add_executable ( complex_demo "Demo/Program/complex_demo.c" ) @@ -501,7 +465,6 @@ add_executable ( import_demo "Demo/Program/import_demo.c" ) target_link_libraries ( pagerank_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( bfs_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( tri_demo graphblas graphblasdemo ${GB_CUDA} ) -target_link_libraries ( pthread_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( openmp_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( mis_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( complex_demo graphblas graphblasdemo ${GB_CUDA} ) diff --git a/GraphBLAS/Extras/tri/.gitignore b/GraphBLAS/CUDA/.gitignore similarity index 68% rename from GraphBLAS/Extras/tri/.gitignore rename to GraphBLAS/CUDA/.gitignore index 36ca52a96d..5db082e1af 100644 --- a/GraphBLAS/Extras/tri/.gitignore +++ b/GraphBLAS/CUDA/.gitignore @@ -1,6 +1,9 @@ # Ignore these files: *.o -tri_main +*.a +*.so +jitFactory +stringify # Do not ignore this file !.gitignore diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cu b/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cu new file mode 100644 index 0000000000..fcb9294910 --- /dev/null +++ b/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cu @@ -0,0 +1,693 @@ +//------------------------------------------------------------------------------ +// GB_AxB_dot3_cuda: compute C = A'*B in parallel, on the GPU(s) +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +// This function only computes C=A'*B on the GPUs. The mask must be +// present, and not complemented. The mask is always applied. 
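+// Editorial sketch (comments only, not part of this patch's logic): the kind
+// of user-level call that is expected to reach this routine is a masked mxm
+// with the first input transposed, for example:
+//
+//      // C<M> = A'*B with a structural, non-complemented mask
+//      GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, GrB_DESC_ST0) ;
+//
+// The semiring and descriptor shown are only one plausible choice; whether
+// the GPU path is taken at all is decided in GB_AxB_dot3_cuda_branch.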
+ +extern "C" +{ + #include "GB_mxm.h" +} +#include "GB_cuda.h" + + +#include "templates/GB_jit_AxB_dot3_phase1.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase2.cu.jit" +// the 5 kernels for the 5 buckets: +#include "templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_mp.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit" +#include "templates/reduceNonZombiesWarp.cu.jit" + +#include "GB_jit_launcher.h" + + +const std::vector header_names ={}; + + +#define GB_FREE_WORK \ +{ \ + GB_cuda_free (Nanobuckets) ; Nanobuckets = NULL ; \ + GB_cuda_free (Blockbucket) ; Blockbucket = NULL ; \ + GB_cuda_free (Bucket); Bucket = NULL; \ + GB_cuda_free (Bucketp); Bucketp = NULL; \ + GB_cuda_free (offset); offset = NULL; \ +} + +#define GB_FREE_ALL \ +{ \ + GB_FREE_WORK ; \ + GrB_Matrix_free (Chandle) ; \ +} + +GrB_Info GB_AxB_dot3_cuda // C = A'*B using dot product method +( + GrB_Matrix *Chandle, // output matrix + const GrB_Matrix M, // mask matrix + const bool Mask_struct, // if true, use the only structure of M + const GrB_Matrix A, // input matrix + const GrB_Matrix B, // input matrix + const GrB_Semiring semiring, // semiring that defines C=A*B + const bool flipxy, // if true, do z=fmult(b,a) vs fmult(a,b) + GB_Context Context +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + GrB_Info info ; + ASSERT (Chandle != NULL) ; + ASSERT (*Chandle == NULL) ; + + ASSERT_MATRIX_OK (M, "M for dot3 cuda A'*B", GB0) ; + ASSERT_MATRIX_OK (A, "A for dot3 cuda A'*B", GB0) ; + ASSERT_MATRIX_OK (B, "B for dot3 cuda A'*B", GB0) ; + + ASSERT (!GB_PENDING (M)) ; + ASSERT (GB_JUMBLED_OK (M)) ; + ASSERT (!GB_ZOMBIES (M)) ; + + ASSERT (!GB_PENDING (A)) ; + ASSERT (!GB_JUMBLED (A)) ; + ASSERT (!GB_ZOMBIES (A)) ; + + ASSERT (!GB_PENDING (B)) ; + ASSERT (!GB_ZOMBIES (B)) ; + ASSERT (!GB_JUMBLED (B)) ; + + ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ; + + ASSERT (A->vlen == B->vlen) ; + GBURBLE ("(GPU dot3) ") ; + + //-------------------------------------------------------------------------- + // initializations + //-------------------------------------------------------------------------- + + int ntasks = 0, number_of_sms = 0 ; + int64_t *Nanobuckets = NULL, *Blockbucket = NULL ; + int64_t *Bucket = NULL; + int64_t *Bucketp = NULL; + int64_t *offset = NULL; + (*Chandle) = NULL ; + + // just in case M is jumbled and we don't handle it yet (TODO) + GB_MATRIX_WAIT (M) ; + ASSERT (!GB_JUMBLED (M)) ; + + int device = -1; + + cudaSetDevice( 0 ) ; + + cudaGetDevice(&device); + + //-------------------------------------------------------------------------- + // get M + //-------------------------------------------------------------------------- + + const int64_t *restrict Mp = M->p ; + const int64_t *restrict Mh = M->h ; + // const int64_t *restrict Mi = M->i ; + // const GB_void *restrict Mx = M->x ; + // const size_t msize = M->type->size ; + const int64_t mvlen = M->vlen ; + const int64_t mvdim = M->vdim ; + const int64_t mnz = GB_NNZ (M) ; + const int64_t mnvec = M->nvec ; + const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ; + + const int64_t anz = GB_NNZ (A) ; + const int64_t anvec = A->nvec ; + + const int64_t bnz = GB_NNZ (B) ; + const int64_t 
bnvec = B->nvec ; + + //-------------------------------------------------------------------------- + // allocate C, the same size and # of entries as M + //-------------------------------------------------------------------------- + + // FUTURE: ctype need not be the op->ztype + GrB_Type ctype = semiring->add->op->ztype ; + int64_t cvlen = mvlen ; + int64_t cvdim = mvdim ; + int64_t cnz = mnz ; + int64_t cnvec = mnvec ; + + // TODO tell GB_CREATE where to put the data: CPU or GPU (via + // cudaMemAdvise), but this works as-is. + int sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ; + info = GB_new_bix (Chandle, // sparse or hyper (from M), new header + ctype, cvlen, cvdim, GB_Ap_malloc, true, + sparsity, false, M->hyper_switch, cnvec, + cnz+1, // add one to cnz for GB_cumsum of Cwork + true, Context) ; + + if (info != GrB_SUCCESS) + { + // out of memory + GB_FREE_ALL ; + return (info) ; + } + + GrB_Matrix C = (*Chandle) ; + //int64_t *Citemp = C->i ; + //auto *Cxtemp = C->x ; + //cudaMalloc ((void**) &(C->i), cnz * sizeof( int64_t) ); + //cudaMalloc ((void**) &(C->x), cnz * C->type->size ); + cudaMemAdvise( C->i, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device); + cudaMemAdvise( C->x, cnz * C->type->size , cudaMemAdviseSetPreferredLocation, device); + + int64_t *restrict Cp = M->p ; + int64_t *restrict Ch = M->h ; + // int64_t *restrict Ci = C->i ; + // use C->i as workspace + + //-------------------------------------------------------------------------- + // copy Mp and Mh into C + //-------------------------------------------------------------------------- + + //cudaMemcpy (Cp, Mp, (cnvec+1) * sizeof (int64_t), cudaMemcpyDefault) ; + if (M_is_hyper) + { + //cudaMemcpy (Ch, Mh, cnvec * sizeof (int64_t), cudaMemcpyDefault) ; + } + C->magic = GB_MAGIC ; + C->nvec_nonempty = M->nvec_nonempty ; + C->nvec = M->nvec ; + + GBURBLE ("(GPU C created and copied from M) ") ; + //-------------------------------------------------------------------------- + // stringify the semiring and the mask + //-------------------------------------------------------------------------- + + char semiring_name [GB_CUDA_STRLEN+2] ; + char semiring_code [GB_CUDA_STRLEN+2] ; + char mask_name [GB_CUDA_STRLEN+2] ; + + GB_cuda_stringify_semiring (semiring, flipxy, + ctype, A->type, B->type, M->type, Mask_struct, // matrix types + true, semiring_name, semiring_code, mask_name) ; + + GBURBLE ("(GPU stringified) ") ; + //-------------------------------------------------------------------------- + // construct the tasks for phase1 and phase2 + //-------------------------------------------------------------------------- + + // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ; + // on the GPU: + + // # of threads in phase1 and phase2 kernel launches must be the same + #define chunksize 128 + #define SYMBOLIC_PHASE_NTHREADS 32 + #define NBUCKETS (GB_BUCKET_MERGEPATH + 1) + + number_of_sms = GB_Global_gpu_sm_get (0) ; + // C and M have cnz entries, so create ... 
+ //ntasks = ( (mnvec +7)/8 + SYMBOLIC_PHASE_NTHREADS -1 )/SYMBOLIC_PHASE_NTHREADS; + ntasks = ( mnz +chunksize -1)/chunksize; + // Idea is to have each task work on a continguous block of columns of C + ntasks = GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x + + GBURBLE ("(GPU mnz=%ld mnvec=%ld blockDim=32, nblock= %d) ", mnz, mnvec, ntasks ) ; + + std::cout<< "ntasks, nthreads = " <p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetPreferredLocation, device) ; + cudaMemAdvise( M->i, mnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device); + cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetPreferredLocation,device) ; + + cudaMemAdvise( M->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( M->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetReadMostly,device) ; + */ + + cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ; + cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL ) ; //stream_data) ; + cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL ) ; //stream_data) ; + /* + cudaMemAdvise( C->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( C->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( C->x, mnz * C->type->size, cudaMemAdviseSetReadMostly,device) ; + */ + //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ; + cudaMemPrefetchAsync( C->i, mnz * sizeof (int64_t), device, NULL ); //stream_data) ; + cudaMemPrefetchAsync( C->x, mnz * C->type->size, device, NULL ); //stream_data) ; + + /* + cudaMemAdvise( A->p, (anvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( A->i, anz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( A->x, anz * A->type->size, cudaMemAdviseSetReadMostly,device) ; + */ + cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL); // stream_data) ; + cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL ) ; //stream_data) ; + cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL ) ; //stream_data) ; + + /* + cudaMemAdvise( B->p, (bnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( B->i, bnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( B->x, bnz * B->type->size, cudaMemAdviseSetReadMostly, device) ; + */ + cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ; + cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL ) ; //stream_data) ; + cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL ) ; //stream_data) ; + + + + // The work to compute C(i,j) is held in Ci [p], if C(i,j) appears in + // as the pth entry in C. 
+ GB_callback mysemiring; + const char *header_name = (const char *)"mySemiRing.h"; + mysemiring.load_string(header_name, semiring_code ) ; + SR_callback_ptr = &mysemiring; + + + //cudaStream_t stream_AxB; + //cudaStreamCreate ( &stream_AxB); + //---------------------------------------------------------------------- + // phase1: assign each C(i,j) to a bucket, and count them + //---------------------------------------------------------------------- + dim3 grid( ntasks) ; + dim3 p2grid( (ntasks + SYMBOLIC_PHASE_NTHREADS -1) + / (SYMBOLIC_PHASE_NTHREADS) ) ; + dim3 block( SYMBOLIC_PHASE_NTHREADS ) ; + + std::string base_name = "GB_jit_AxB_dot3_"; + std::string Opname = "phase1_" ; + + jitify::experimental::KernelLauncher phase1Kernel = + jit::launcher( base_name + Opname + mask_name, + templates_GB_jit_AxB_dot3_phase1_cu, + header_names, + compiler_flags, + callback_wrapper) //, + //stream_AxB) + .set_kernel_inst("GB_AxB_cuda_dot3_phase1", + {M->type->name}) + .configure(grid, block); + + //---------------------------------------------------------------------- + // phase2: cumsum across the blockbuckets, propagate to thread level + //---------------------------------------------------------------------- + base_name = "GB_jit_AxB_dot3_"; + Opname = "phase2"; + jitify::experimental::KernelLauncher phase2Kernel = + jit::launcher( base_name + Opname, + templates_GB_jit_AxB_dot3_phase2_cu, + header_names, + compiler_flags, + callback_wrapper) //, + //stream_AxB) + .set_kernel_inst("GB_AxB_dot3_phase2", + {}) + .configure(p2grid, block); + + base_name = "GB_jit_AxB_dot3_"; + Opname = "phase2"; + jitify::experimental::KernelLauncher phase2endKernel = + jit::launcher( base_name + Opname, + templates_GB_jit_AxB_dot3_phase2_cu, + header_names, + compiler_flags, + callback_wrapper) //, + //stream_AxB) + .set_kernel_inst("GB_AxB_dot3_phase2end", + {}) + .configure(grid, block); + + + phase1Kernel.launch( + Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x + Blockbucket, // bucket counts, of size NBUCKETS-by-gridDim.x + // input/output: + C, // final output matrix + // inputs, not modified: + M, // mask matrix + A, // input matrix + B // input matrix + ); + + + // cudaDeviceSynchronize(); + + + GBURBLE ("(GPU phase1 done) ") ; + //for (int i = 0; i< cnz; i++){ + // printf("C[%d] = %ld\n", i , Ci[i]); + //} + //---------------------------------------------------------------------- + // phase2: cumsum across the blockbuckets, propagate to thread level + //---------------------------------------------------------------------- + int nblock = ntasks; + + phase2Kernel.launch( // input + Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x + Blockbucket, // bucket counts, of size NBUCKETS-by-gridDim.x + // input/output: + Bucketp, // global bucket cumsum, of size NBUCKETS+1 + Bucket, // global buckets, of size cnz (== mnz) + offset, + C, // final output matrix + // inputs, not modified: + cnz, // number of entries in mask and output matrix + nblock + ); + + cudaDeviceSynchronize(); + //cudaMemPrefetchAsync( offset, (NBUCKETS) * sizeof (int64_t), cudaCpuDeviceId, NULL) ; + + int64_t s= 0; + for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket) + { + Bucketp[bucket] = s; + s+= offset[bucket]; + //printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]); + } + + GBURBLE ("(GPU phase2 done) ") ; + + phase2endKernel.launch( // input + Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x + Blockbucket, // bucket counts, of size NBUCKETS-by-gridDim.x + // input/output: + Bucketp, // 
global bucket cumsum, of size NBUCKETS+1 + Bucket, // global buckets, of size cnz (== mnz) + offset, + C, // final output matrix + // inputs, not modified: + cnz // number of entries in mask and output matrix + ); + + cudaDeviceSynchronize(); + + GBURBLE ("(GPU phase2end done) ") ; + /* + for (int i = 0; i< cnz; i++){ + printf("C[%d],Bucket = %ld,%ld\n", i , Ci[i], Bucket[i]); + } + */ + + //---------------------------------------------------------------------- + // phase3: do the numerical work + //---------------------------------------------------------------------- + + base_name = "GB_jit_"; + std::string kernel_name = "AxB_dot3_phase3_"; + C->nzombies = Bucketp[1]; //set pre-zombie counts + + for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket) + { + std::string Opname = ""; + int sz = 0 ; + + const char* jit_template; + + int64_t start = Bucketp[bucket]; + int64_t end = Bucketp[bucket+1]; + + //if( (end-start>0) && (start == Bucketp[1]) ) start = Bucketp[0]; //add in zombie slots + + int64_t Cnz = end- start; + + int gridsz, blocksz; + + //Nothing to do, next bucket + if ( Cnz == 0 ) continue; + + GBURBLE ("\n\n(GPU phase3 bucket,bucketsize= %d,%ld) ",bucket,Cnz) ; + + switch (bucket) + { + + //-------------------------------------------------------------- + // not a bucket ... bring out your dead: + //-------------------------------------------------------------- + + case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket) + break ; + + //-------------------------------------------------------------- + // CUDA kernel: dndn, handles a single bucket: + //-------------------------------------------------------------- + + // both A(:,i) and B(:,j) are dense + case GB_BUCKET_DNDN : + Opname = "dndn" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_dndn_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + //-------------------------------------------------------------- + // CUDA kernel: spdn, handles 4 buckets: + //-------------------------------------------------------------- + + // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + case GB_BUCKET_DNVS : + // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + case GB_BUCKET_VSDN : + sz = 64 ; + Opname = "spdn" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + case GB_BUCKET_DNSP : + // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + case GB_BUCKET_SPDN : + sz = 256 ; + Opname = "spdn" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + //-------------------------------------------------------------- + // CUDA kernel: vssp, handles 1 bucket, uses binary search: + //-------------------------------------------------------------- + + // A(:,i) is very sparse compared to B(:,j), or visa versa + case GB_BUCKET_VSSP : + Opname = "vssp" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_vssp_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + //-------------------------------------------------------------- + // CUDA kernel: vsvs, handles 4 buckets: + //-------------------------------------------------------------- + + // let len = nnz (A (:,i) + nnz (B (:,j)), then: + + case GB_BUCKET_VSVS_256 : sz += 256-64 ; + case GB_BUCKET_VSVS_64 : sz += 64-16 ; + case GB_BUCKET_VSVS_16 : sz += 16-4 ; + case GB_BUCKET_VSVS_4 : sz += 4 ; + Opname = "vsvs" ; + 
jit_template = templates_GB_jit_AxB_dot3_phase3_vsvs_cu; + blocksz = 1024; + gridsz = GB_IMIN( 1024*number_of_sms, ( Cnz + blocksz -1 )/blocksz); + gridsz = ( Cnz + blocksz -1 )/blocksz; + /* + Opname = "warpix" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu; + blocksz = 32; + gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms); + */ + break ; + + //-------------------------------------------------------------- + // CUDA kernel: mp, use the merge-path method: + //-------------------------------------------------------------- + + case GB_BUCKET_MERGEPATH : + Opname = "mp" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_mp_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + case GB_BUCKET_WARP_IX : sz = 32 ; + Opname = "warpix" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu; + blocksz = 32; + gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms); + break ; + + default: + break ; + } + + dim3 grid(gridsz); + dim3 block(blocksz); + + std::cout<< "Kernel name =" <name, + A->type->name, + B->type->name, + semiring->multiply->xtype->name, + semiring->multiply->ytype->name, + semiring->multiply->ztype->name }) + .configure(grid, block) //if commented, use implicit 1D configure in launch + .launch( + start, // input/output: + end, // global bucket cumsum, of size NBUCKETS+1 + Bucket, // global buckets, of size cnz (== mnz) + C, // final output matrix + // inputs, not modified: + M, // Mi used for column index + A, // A matrix + B, // B matrix + sz // only used for sparse-sparse cases + + ); + + cudaDeviceSynchronize(); + } + GBURBLE ("(GPU phase3 done) ") ; + + std::string reduce_kernel_name = "reduceNonZombiesWarp"; + const char* jit_template; + #define red_blocksz 1024 + jit_template = templates_reduceNonZombiesWarp_cu; + int num_reduce_blocks = GB_IMIN( 32*number_of_sms, (cnz + red_blocksz -1)/ red_blocksz ) ; + dim3 red_grid( num_reduce_blocks ) ; + dim3 red_block( red_blocksz ) ; + + int32_t *block_sum; + //cudaMallocManaged ((void**) &block_sum, (num_reduce_blocks)*sizeof(int32_t)) ; + block_sum = (int32_t*)GB_cuda_malloc( (num_reduce_blocks)*sizeof(int32_t)) ; + + GBURBLE ("(GPU reduce launch nblocks,blocksize= %d,%d )\n", num_reduce_blocks, red_blocksz) ; + jit::launcher( reduce_kernel_name + "_" + semiring_name, + jit_template, + header_names, + compiler_flags, + callback_wrapper) + .set_kernel_inst( reduce_kernel_name , { ctype->name }) + .configure(red_grid, red_block) //if commented, use implicit 1D configure in launch + .launch( + C->i, // index vector, only sum up values >= 0 + C->x, // input pointer to vector to reduce, with zombies + block_sum, // Block sums on return + (unsigned int)cnz // length of vector to reduce to scalar + + ); + + cudaDeviceSynchronize(); + + int32_t num_triangles = 0; + for (int i = 0; i< num_reduce_blocks; i++){ + //printf("block%d num_triangles = %d\n", i, block_sum[i] ); + num_triangles += block_sum[i] ; + } + printf("num_triangles = %d\n", num_triangles ); + + GB_cuda_free( block_sum ); + //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), cudaCpuDeviceId, NULL) ; //stream_data ) ; + //cudaMemPrefetchAsync( C->i, cnz * sizeof (int64_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ; + //cudaMemPrefetchAsync( C->x, cnz * sizeof (int32_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ; + /* + cudaMemcpy( Citemp, C->i, cnz * sizeof( int64_t), cudaMemcpyDefault ); + cudaMemcpy( Cxtemp, C->x, cnz * C->type->size, cudaMemcpyDefault ); + GB_cuda_free( C->i); + GB_cuda_free( C->x); + C->i = Citemp; + C->x = 
Cxtemp; + */ + + cudaDeviceSynchronize(); + + return GrB_SUCCESS; +} + diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cu b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cu new file mode 100644 index 0000000000..79083969f5 --- /dev/null +++ b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cu @@ -0,0 +1,41 @@ + +// Decide branch direction for GPU use for the dot-product MxM +extern "C" +{ + #include "GB_mxm.h" +} +#include "GB_cuda.h" + +bool GB_AxB_dot3_cuda_branch +( + const GrB_Matrix M, // mask matrix + const bool Mask_struct, // if true, use the only structure of M + const GrB_Matrix A, // input matrix + const GrB_Matrix B, // input matrix + const GrB_Semiring semiring, // semiring that defines C=A*B + const bool flipxy, // if true, do z=fmult(b,a) vs fmult(a,b) + GB_Context Context +) +{ + // very rough estimate of the work to do + double adeg = ((double) GB_NNZ (A)) / ((double) GB_IMAX (1, A->nvec)) ; + double bdeg = ((double) GB_NNZ (B)) / ((double) GB_IMAX (1, B->nvec)) ; + double work = GB_NNZ (M) * GB_IMIN (adeg, bdeg) ; + + // TODO if A or B are not accessed (first, 2nd, or pair ops) + // then the type if A can be user-defined here, for CUDA. + + int ngpus_to_use = GB_ngpus_to_use (work) ; + GBURBLE (" work:%g gpus:%d ", work, ngpus_to_use) ; + if (ngpus_to_use > 0 && semiring->semiring_is_builtin + && (A->type->code != GB_UDT_code) + && (B->type->code != GB_UDT_code)) + { + return true; + } + else + { + return false; + } + +} diff --git a/GraphBLAS/CUDA/GB_callback.hpp b/GraphBLAS/CUDA/GB_callback.hpp new file mode 100644 index 0000000000..f277a0e320 --- /dev/null +++ b/GraphBLAS/CUDA/GB_callback.hpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Implementations of string callbacks +#include +#pragma once +// Define function pointer we will use later +//std::istream* (*file_callback)(std::string, std::iostream&); + +// Define a factory class for building any buffer of text +class GB_callback { + char *callback_string; + const char *include_filename; + public: + void load_string(const char *fname, char *input){ + callback_string = input; + include_filename = fname; + } + std::istream* callback( std::string filename, std::iostream& tmp_stream) { + if ( filename == std::string(this->include_filename) ) + { + tmp_stream << this->callback_string; + return &tmp_stream; + } + else + { + return nullptr; + } + } +}; + diff --git a/GraphBLAS/CUDA/GB_cuda.h b/GraphBLAS/CUDA/GB_cuda.h new file mode 100644 index 0000000000..d8ecf291b6 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda.h @@ -0,0 +1,98 @@ +//------------------------------------------------------------------------------ +// GB_cuda.h: definitions for using CUDA in GraphBLAS +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS/CUDA, (c) NVIDIA Corp. 2017-2019, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files. + +#ifndef GB_CUDA_H +#define GB_CUDA_H + +// nvcc chokes on the 'restrict' keyword, so define it to the empty string +// for compiling the *.cu files. +#define restrict + +// nvcc also chokes on fpclassify (an ANSI C11 construct that does not appear +// in C++11, it seems). It also issues spurious warnings about compiler +// pragmas. Source/GB.h avoids these constructs if GB_NVCC is defined. 
+#define GB_NVCC + +extern "C" +{ + #include "GB.h" + #include "GB_Global.h" + #include + #include +} + +// GB_cuda_gateway.h is also included in Source/GB* files, which are not +// compiled with nvcc, nor do they see the cuda.h or cuda_runtime.h +// definitions. Thus, this #include comes first. +#include "GB_cuda_gateway.h" + +#include "GB_cuda_global.h" + +// Finally, include the CUDA definitions +#include "cuda.h" +#include "cuda_runtime.h" +#include "jitify.hpp" + +#include +#include +#include + +#define CHECK_CUDA_SIMPLE(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + const char* str = cudaGetErrorName( err); \ + std::cout << "(CUDA runtime) returned " << str; \ + std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \ + << "())" << std::endl; \ + return (GrB_PANIC) ; \ + } \ + } while (0) + + +//------------------------------------------------------------------------------ +// GB_CUDA_CATCH: catch error from a try { ... } region +//------------------------------------------------------------------------------ + +// Usage: Must be used in a GB* function that returns GrB_Info, and has a +// GB_Context Context parameter. +// +// #define GB_FREE_ALL { some macro to free all temporaries } +// GrB_Info info ; +// try { ... do stuff that can through an exception } +// GB_CUDA_CATCH (info) ; + +#define GB_CUDA_CATCH(info) \ + catch (std::exception& e) \ + { \ + printf ("CUDA error: %s\n", e.what ( )) ; \ + info = GrB_PANIC ; \ + /* out_of_memory : info = GrB_OUT_OF_MEMORY ; */ \ + /* nulltpr: info = ... ; */ \ + /* no gpus here: info = GrB_PANIC ; */ \ + } \ + if (info != GrB_SUCCESS) \ + { \ + /* CUDA failed */ \ + GB_FREE_ALL ; \ + return (GB_ERROR (info, (GB_LOG, "CUDA died\n"))) ; \ + } + +// 12 buckets: computed by up to 11 kernel launches (zombies need no work...), +// using 5 different kernels (with different configurations depending on the +// bucket). + #include "GB_cuda_buckets.h" +extern "C" +{ + #include "GB_cuda_stringify.h" + +} +#endif diff --git a/GraphBLAS/CUDA/GB_cuda_buckets.h b/GraphBLAS/CUDA/GB_cuda_buckets.h new file mode 100644 index 0000000000..4c616c5252 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_buckets.h @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ +// GB_cuda_buckets.h: definitions for buckets using for dot3 +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +// This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files. + +#ifndef GB_CUDA_BUCKETS_H +#define GB_CUDA_BUCKETS_H + +// nvcc chokes on the 'restrict' keyword, so define it to the empty string +// for compiling the *.cu files. +#define restrict + +// nvcc also chokes on fpclassify (an ANSI C11 construct that does not appear +// in C++11, it seems). It also issues spurious warnings about compiler +// pragmas. Source/GB.h avoids these constructs if GB_NVCC is defined. +#define GB_NVCC + + +// 12 buckets: computed by up to 11 kernel launches (zombies need no work...), +// using 5 different kernels (with different configurations depending on the +// bucket). 
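+// Editorial sketch (hypothetical, comments only): phase1 is expected to map a
+// per-entry work estimate onto the codes below; for the vsvs buckets, a
+// selection on len = nnz (A (:,i)) + nnz (B (:,j)) consistent with the
+// thresholds documented in the enum would look like:
+//
+//      // (len <=   4) ? GB_BUCKET_VSVS_4   :
+//      // (len <=  16) ? GB_BUCKET_VSVS_16  :
+//      // (len <=  64) ? GB_BUCKET_VSVS_64  :
+//      // (len <= 256) ? GB_BUCKET_VSVS_256 : GB_BUCKET_MERGEPATH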
+typedef enum +{ + // bring out your dead: + GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) + +// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper +// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x M is (sparse/hyper) + +// a full/full kernel: + // CUDA kernel: dndn, handles a single bucket: + // both A(:,i) and B(:,j) are dense + GB_BUCKET_DNDN = 1, + +// two full/(sparse,hyper) kernels: + // CUDA kernel: spdn, handles 4 buckets: + // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + GB_BUCKET_DNVS = 2, + // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + GB_BUCKET_DNSP = 3, + +// a sparse/full kernel + // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + GB_BUCKET_VSDN = 4, + // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + GB_BUCKET_SPDN = 5, + +// a sparse/bitmap kernel +// a bitmap/bitmap kernel +// a bitmap/sparse kernel +// ... + + +// sparse/sparse: + // CUDA kernel: vssp, handles 1 bucket, uses binary search: + // A(:,i) is very sparse compared to B(:,j), or visa versa + GB_BUCKET_VSSP = 6, + + // CUDA kernel: vsvs, handles 4 buckets: + // let len = nnz (A (:,i) + nnz (B (:,j)), then: + GB_BUCKET_VSVS_4 = 7, // len <= 4 + GB_BUCKET_VSVS_16 = 8, // len <= 16 + GB_BUCKET_VSVS_64 = 9, // len <= 64 + GB_BUCKET_VSVS_256 = 10, // len <= 256 + + // CUDA kernel: mp, use the merge-path method: + GB_BUCKET_MERGEPATH = 11, + + // CUDA kernel: warpix, use the warp-intersect method, unused so far: + GB_BUCKET_WARP_IX = 12 +} +GB_bucket_code ; + +#endif diff --git a/GraphBLAS/CUDA/GB_cuda_calloc.cu b/GraphBLAS/CUDA/GB_cuda_calloc.cu new file mode 100644 index 0000000000..4d5d46bf95 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_calloc.cu @@ -0,0 +1,31 @@ +//------------------------------------------------------------------------------ +// GB_cuda_calloc.cu: wrapper for cudaMallocManaged and memset +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" + +void *GB_cuda_calloc (size_t n, size_t size) // standcard calloc signature +{ + + // malloc the space + void *p = GB_cuda_malloc (n * size) ; + + if (p == NULL) + { + // out of memory, or other CUDA error + return (NULL) ; + } + + // set the space to zero + memset (p, 0, n * size) ; + + // return the result + return (p) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_cumsum.cu b/GraphBLAS/CUDA/GB_cuda_cumsum.cu new file mode 100644 index 0000000000..f3dc45569e --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_cumsum.cu @@ -0,0 +1,67 @@ +//------------------------------------------------------------------------------ +// GB_cuda_cumsum: cumlative sum of an array using GPU acceleration +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
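+// Editorial worked example of the semantics described below (comments only):
+// with n = 4 and count = {3, 1, 0, 2, *} on input, the output is
+// count = {0, 3, 4, 4, 6}, where count [4] = 6 is the total sum.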
+ +//------------------------------------------------------------------------------ + +// Compute the cumulative sum of an array count[0:n], of size n+1 +// in pseudo-MATLAB notation: + +// k = sum (count [0:n-1] != 0) ; + +// count = cumsum ([0 count[0:n-1]]) ; + +// That is, count [j] on input is overwritten with the value of +// sum (count [0..j-1]). count [n] is implicitly zero on input. +// On output, count [n] is the total sum. + +#include "GB_cuda.h" +#include + +GrB_Info GB_cuda_cumsum // compute the cumulative sum of an array +( + int64_t *restrict count, // size n+1, input/output + const int64_t n +) +{ + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + ASSERT (count != NULL) ; + ASSERT (n >= 0) ; + + //-------------------------------------------------------------------------- + // count = cumsum ([0 count[0:n-1]]) ; + //-------------------------------------------------------------------------- + void *d_temp_storage = NULL; + size_t temp_storage_bytes; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, count, count, (int)n); + d_temp_storage = GB_malloc_memory( temp_storage_bytes, 1); + if ( d_temp_storage == NULL){ + return GrB_OUT_OF_MEMORY; + } + + // Run + CubDebugExit(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, count, count, n)); + + // Check for correctness (and display results, if specified) + #ifdef GB_DEBUG + int compare = CompareDeviceResults(h_reference, count, num_items, true, g_verbose); + ASSERT( compare == 0); + #endif + + // Cleanup + GB_free_memory(d_temp_storage ) ; + + return GrB_SUCCESS; +} + + + + + diff --git a/GraphBLAS/CUDA/GB_cuda_free.cu b/GraphBLAS/CUDA/GB_cuda_free.cu new file mode 100644 index 0000000000..1fa21b3dfc --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_free.cu @@ -0,0 +1,19 @@ +//------------------------------------------------------------------------------ +// GB_cuda_free.cu: wrapper for cudaFree +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" +#include "rmm/detail/cnmem.h" + +void GB_cuda_free (void *p) // standard free signature +{ + cnmemFree( p , NULL); + //printf(" GPU %d freeing mem\n", device); +} + diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu new file mode 100644 index 0000000000..cf7ed54962 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu @@ -0,0 +1,21 @@ +//------------------------------------------------------------------------------ +// GB_cuda_get_device_count.cu: find out how many GPUs exist +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
+ +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" + +bool GB_cuda_get_device_count // true if OK, false if failure +( + int *gpu_count // return # of GPUs in the system +) +{ + cudaError_t err = cudaGetDeviceCount (gpu_count) ; + return (err == cudaSuccess) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu new file mode 100644 index 0000000000..d3b48f1f64 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu @@ -0,0 +1,97 @@ +//------------------------------------------------------------------------------ +// GB_cuda_get_device_properties.cu: get the properties of a GPU +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" + +bool GB_cuda_get_device ( int &device){ + bool goodreturn = false; + if (&device == NULL) + { + // invalid inputs + return (false) ; + } + + CHECK_CUDA_SIMPLE ( cudaGetDevice( &device ) ); + goodreturn = true; + + return goodreturn; + +} + +bool GB_cuda_set_device( int device) { + bool goodreturn = false; + if (device < 0) + { + // invalid inputs + return (false) ; + } + + CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ); + goodreturn = true; + + return goodreturn; +} + +bool GB_cuda_get_device_properties // true if OK, false if failure +( + int device, + GB_cuda_device *prop +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + bool goodreturn = false; + if (prop == NULL || device < 0) + { + // invalid inputs + return (false) ; + } + + int old_device; + CHECK_CUDA_SIMPLE ( cudaGetDevice( &old_device ) ) ; + + + //-------------------------------------------------------------------------- + // get the properties + //-------------------------------------------------------------------------- + int num_sms; + int compute_capability_major; + int compute_capability_minor; + size_t memfree, memtotal; + + CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&num_sms, + cudaDevAttrMultiProcessorCount, + device) ); + CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_major, + cudaDevAttrComputeCapabilityMajor, + device) ); + CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_minor, + cudaDevAttrComputeCapabilityMajor, + device) ); + + CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ); + CHECK_CUDA_SIMPLE ( cudaMemGetInfo( & memfree, &memtotal) ) ; + CHECK_CUDA_SIMPLE ( cudaSetDevice( old_device ) ); + + prop->total_global_memory = memtotal; + prop->number_of_sms = num_sms; + prop->compute_capability_major = compute_capability_major; + prop->compute_capability_minor = compute_capability_minor; + + goodreturn = true; + //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- + + return goodreturn; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_global.cpp b/GraphBLAS/CUDA/GB_cuda_global.cpp new file mode 100644 index 0000000000..fbe0d168d8 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_global.cpp @@ -0,0 +1,24 @@ 
+//------------------------------------------------------------------------------ +// GB_cuda_global.cpp: accessor functions for global GraphBLAS/CUDA variables +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS/CUDA, (c) NVIDIA Corp. 2017-2019, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#include "GB_callback.hpp" + +//Global definition required here, sorry +GB_callback *SR_callback_ptr; // thunk + +std::istream* callback_wrapper +( + std::string file_name, // string with the requested "file" name + std::iostream& file_stream // the I/O stream for the "file" contents +) +{ + return SR_callback_ptr->callback (file_name, file_stream) ; +} + + diff --git a/GraphBLAS/CUDA/GB_cuda_global.h b/GraphBLAS/CUDA/GB_cuda_global.h new file mode 100644 index 0000000000..8b26e728a0 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_global.h @@ -0,0 +1,18 @@ +//------------------------------------------------------------------------------ +// GB_cuda_global.h: global variables needed for GraphBLAS/CUDA +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS/CUDA, (c) NVIDIA Corp. 2017-2019, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_GLOBAL_H +#define GB_CUDA_GLOBAL_H + +#include "GB_callback.hpp" + +std::istream* callback_wrapper( std::string file_name, std::iostream& tmp) ; + +#endif + diff --git a/GraphBLAS/CUDA/GB_cuda_malloc.cu b/GraphBLAS/CUDA/GB_cuda_malloc.cu new file mode 100644 index 0000000000..64c5928961 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_malloc.cu @@ -0,0 +1,24 @@ +//------------------------------------------------------------------------------ +// GB_cuda_malloc.cu: wrapper for cuda Managed Memory allocator, or pool +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" +#include "rmm/detail/cnmem.h" + +void *GB_cuda_malloc (size_t size) // standard malloc signature +{ + void *p = NULL ; + + cnmemMalloc( &p, size, NULL); + + return p; + + +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringifier.cpp b/GraphBLAS/CUDA/GB_cuda_stringifier.cpp new file mode 100644 index 0000000000..52f90dfab7 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringifier.cpp @@ -0,0 +1,877 @@ +// Class to manage both stringify functions from semiring, ops and monoids to char buffers +// Also provides a iostream callback to deliver the buffer to jitify as if read from a file + +// (c) Nvidia Corp. 
2020 All rights reserved +// SPDX-License-Identifier: Apache-2.0 + +// Implementations of string callbacks +#pragma once +#include +#include "GB.h" +#include "GB_cuda_stringify.h" + +// Define function pointer we will use later +//std::istream* (*file_callback)(std::string, std::iostream&); + +// Define a factory class for building any buffer of text +class GB_cuda_stringifier { + char callback_buffer[2048]; + char *callback_string; + const char *include_filename; + + public: + +//------------------------------------------------------------------------------ +// load string: set string and file name to mimic +//------------------------------------------------------------------------------ + void load_string(const char *fname, char *input) + { + callback_string = input; + include_filename = fname; + } + +//------------------------------------------------------------------------------ +// callback: return string as if it was read from a file +//------------------------------------------------------------------------------ + + std::istream* callback( std::string filename, std::iostream& tmp_stream) + { + if ( filename == std::string(this->include_filename) ) + { + tmp_stream << this->callback_string; + return &tmp_stream; + } + else + { + return nullptr; + } + } + +//------------------------------------------------------------------------------ +// stringify_identity: return string for identity value +//------------------------------------------------------------------------------ +#define ID( x) IDENT = (x) + void stringify_identity + ( + // output: + char *code_string, // string with the #define macro + // input: + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // type code used in the opcode we want + ) + { + const char *IDENT; + switch (opcode) + { + case GB_MIN_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("true") ; // boolean AND + case GB_INT8_code : ID ("INT8_MAX") ; + case GB_INT16_code : ID ("INT16_MAX") ; + case GB_INT32_code : ID ("INT32_MAX") ; + case GB_INT64_code : ID ("INT64_MAX") ; + case GB_UINT8_code : ID ("UINT8_MAX") ; + case GB_UINT16_code : ID ("UINT16_MAX") ; + case GB_UINT32_code : ID ("UINT32_MAX") ; + case GB_UINT64_code : ID ("UINT64_MAX") ; + default : ID ("INFINITY") ; + } + break ; + + case GB_MAX_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("false") ; // boolean OR + case GB_INT8_code : ID ("INT8_MIN") ; + case GB_INT16_code : ID ("INT16_MIN") ; + case GB_INT32_code : ID ("INT32_MIN") ; + case GB_INT64_code : ID ("INT64_MIN") ; + case GB_UINT8_code : ID ("0") ; + case GB_UINT16_code : ID ("0") ; + case GB_UINT32_code : ID ("0") ; + case GB_UINT64_code : ID ("0") ; + default : ID ("(-INFINITY)") ; + } + break ; + + case GB_PLUS_opcode : ID ("0") ; + case GB_TIMES_opcode : ID ("1") ; + case GB_LOR_opcode : ID ("false") ; + case GB_LAND_opcode : ID ("true") ; + case GB_LXOR_opcode : ID ("false") ; + // case GB_LXNOR_opcode : + case GB_EQ_opcode : ID ("true") ; + // case GB_ANY_opcode : + default : ID ("0") ; + } + snprintf (code_string, GB_CUDA_STRLEN, "#define GB_IDENTITY (%s)", IDENT) ; + + } + + + const char *GB_cuda_stringify_opcode + ( + GB_Opcode opcode // opcode of GraphBLAS operator + ) + { + switch (opcode) + { + case GB_FIRST_opcode : return ("1st") ; + // case GB_ANY_opcode : return ("any") ; + case GB_SECOND_opcode : return ("2nd") ; + case GB_MIN_opcode : return ("min") ; + case GB_MAX_opcode : return ("max") ; + case GB_PLUS_opcode : return ("plus") ; + case GB_MINUS_opcode : return 
("minus") ; + case GB_RMINUS_opcode : return ("rminus") ; + case GB_TIMES_opcode : return ("times") ; + case GB_DIV_opcode : return ("div") ; + case GB_RDIV_opcode : return ("rdiv") ; + case GB_EQ_opcode : return ("eq") ; + case GB_ISEQ_opcode : return ("iseq") ; + case GB_NE_opcode : return ("ne") ; + case GB_ISNE_opcode : return ("isne") ; + case GB_GT_opcode : return ("gt") ; + case GB_ISGT_opcode : return ("isgt") ; + case GB_LT_opcode : return ("lt") ; + case GB_ISLT_opcode : return ("islt") ; + case GB_GE_opcode : return ("ge") ; + case GB_ISGE_opcode : return ("isge") ; + case GB_LE_opcode : return ("le") ; + case GB_ISLE_opcode : return ("isle") ; + case GB_LOR_opcode : return ("lor") ; + case GB_LAND_opcode : return ("land") ; + case GB_LXOR_opcode : return ("lxor") ; + // case GB_BOR_opcode : ... bitwise ops + // x | y, etc + // case GB_PAIR_opcode : + default : ; + } + + return ("") ; + } + + void stringify_binop + ( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator + ) + { + + // The binop macro generates an expression, not a full statement (there + // is no semicolon). + + // for example: + // #define GB_MULT(x,y) ((x) * (y)) + + const char *f ; + + switch (opcode) + { + + case GB_FIRST_opcode : // 7: z = x + + f = "(x)" ; + break ; + + // case GB_ANY_opcode : + case GB_SECOND_opcode : // 8: z = y + + f = "(y)" ; + break ; + + case GB_MIN_opcode : // 9: z = min(x,y) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) && (y)" ; + case GB_FP32_code : f = "fminf (x,y)" ; + case GB_FP64_code : f = "fmin (x,y)" ; + default : f = "GB_IMIN (x,y)" ; + } + break ; + + case GB_MAX_opcode : // 10: z = max(x,y) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) || (y)" ; + case GB_FP32_code : f = "fmaxf (x,y)" ; + case GB_FP64_code : f = "fmax (x,y)" ; + default : f = "GB_IMAX (x,y)" ; + } + break ; + + case GB_PLUS_opcode : // 11: z = x + y + + switch (zcode) + { + case GB_BOOL_code : f = "(x) || (y)" ; + default : f = "(x) + (y)" ; + } + break ; + + case GB_MINUS_opcode : // 12: z = x - y + + switch (zcode) + { + case GB_BOOL_code : f = "(x) != (y)" ; + default : f = "(x) - (y)" ; + } + break ; + + case GB_RMINUS_opcode : // 13: z = y - x + + switch (zcode) + { + case GB_BOOL_code : f = "(x) != (y)" ; + default : f = "(y) - (x)" ; + } + break ; + + case GB_TIMES_opcode : // 14: z = x * y + + switch (zcode) + { + case GB_BOOL_code : f = "(x) && (y)" ; + default : f = "(x) * (y)" ; + } + break ; + + case GB_DIV_opcode : // 15: z = x / y ; + + switch (zcode) + { + case GB_BOOL_code : f = "(x)" ; + case GB_INT8_code : f = "GB_IDIV_SIGNED (x,y,8)" ; + case GB_INT16_code : f = "GB_IDIV_SIGNED (x,y,16)" ; + case GB_INT32_code : f = "GB_IDIV_SIGNED (x,y,32)" ; + case GB_INT64_code : f = "GB_IDIV_SIGNED (x,y,64)" ; + case GB_UINT8_code : f = "GB_IDIV_UNSIGNED (x,y,8)" ; + case GB_UINT16_code : f = "GB_IDIV_UNSIGNED (x,y,16)" ; + case GB_UINT32_code : f = "GB_IDIV_UNSIGNED (x,y,32)" ; + case GB_UINT64_code : f = "GB_IDIV_UNSIGNED (x,y,64)" ; + default : f = "(x) / (y)" ; + } + break ; + + case GB_RDIV_opcode : // z = y / x ; + + switch (zcode) + { + case GB_BOOL_code : f = "(x)" ; + case GB_INT8_code : f = "GB_IDIV_SIGNED (y,x,8)" ; + case GB_INT16_code : f = "GB_IDIV_SIGNED (y,x,16)" ; + case GB_INT32_code : f = "GB_IDIV_SIGNED (y,x,32)" ; + case GB_INT64_code : f = 
"GB_IDIV_SIGNED (y,x,64)" ; + case GB_UINT8_code : f = "GB_IDIV_UNSIGNED (y,x,8)" ; + case GB_UINT16_code : f = "GB_IDIV_UNSIGNED (y,x,16)" ; + case GB_UINT32_code : f = "GB_IDIV_UNSIGNED (y,x,32)" ; + case GB_UINT64_code : f = "GB_IDIV_UNSIGNED (y,x,64)" ; + default : f = "(y) / (x)" ; + } + break ; + + case GB_EQ_opcode : + case GB_ISEQ_opcode : // 17: z = (x == y) + + f = "(x) == (y)" ; + break ; + + case GB_NE_opcode : + case GB_ISNE_opcode : // 18: z = (x != y) + + f = "(x) != (y)" ; + break ; + + case GB_GT_opcode : + case GB_ISGT_opcode : // 19: z = (x > y) + + f = "(x) > (y)" ; + break ; + + case GB_LT_opcode : + case GB_ISLT_opcode : // 20: z = (x < y) + + f = "(x) < (y)" ; + break ; + + case GB_GE_opcode : + case GB_ISGE_opcode : // 21: z = (x >= y) + + f = "(x) >= (y)" ; + break ; + + case GB_LE_opcode : + case GB_ISLE_opcode : // 22: z = (x <= y) + + f = "(x) <= (y)" ; + break ; + + case GB_LOR_opcode : // 23: z = (x != 0) || (y != 0) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) || (y)" ; + default : f = "((x) != 0) || ((y) != 0)" ; + } + break ; + + case GB_LAND_opcode : // 23: z = (x != 0) && (y != 0) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) && (y)" ; + default : f = "((x) != 0) && ((y) != 0)" ; + } + break ; + + case GB_LXOR_opcode : // 25: z = (x != 0) != (y != 0) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) != (y)" ; + default : f = "((x) != 0) != ((y) != 0)" ; + } + break ; + + // case GB_BOR_opcode : ... bitwise ops + // x | y, etc + + // case GB_PAIR_opcode : + default : + + f = "1" ; + break ; + } + + snprintf (code_string, GB_CUDA_STRLEN, + "#define %s(x,y) (%s)", macro_name, f) ; + } + + + void stringify_terminal + ( + // outputs: + bool *is_monoid_terminal, + char *terminal_condition, + char *terminal_statement, + // inputs: + const char *macro_condition_name, + const char *macro_statement_name, + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // op->ztype->code + ) + { + //------------------------------------------------------------------------------ + // GB_cuda_stringify_terminal: string to check terminal condition + //------------------------------------------------------------------------------ + + // The macro_condition_name(cij) should return true if the value of cij has + // reached its terminal value, or false otherwise. If the monoid is not + // terminal, then the macro should always return false. The ANY monoid + // should always return true. + + // The macro_statement_name is a macro containing a full statement. If the + // monoid is never terminal, it becomes the empty statement (";"). Otherwise, + // it checks the terminal condition and does a "break" if true. 
+ + + //-------------------------------------------------------------------------- + // determine if the monoid is terminal, and find its terminal value + //-------------------------------------------------------------------------- + + bool is_terminal = false ; + const char *f = NULL ; + + switch (opcode) + { + + #if 0 + case GB_ANY_opcode : + f = NULL ; + is_terminal = true ; + break ; + #endif + + case GB_MIN_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "false" ; break ; + case GB_INT8_code : f = "INT8_MIN" ; break ; + case GB_INT16_code : f = "INT16_MIN" ; break ; + case GB_INT32_code : f = "INT32_MIN" ; break ; + case GB_INT64_code : f = "INT64_MIN" ; break ; + case GB_UINT8_code : f = "0" ; break ; + case GB_UINT16_code : f = "0" ; break ; + case GB_UINT32_code : f = "0" ; break ; + case GB_UINT64_code : f = "0" ; break ; + default : f = "(-INFINITY)" ; break ; + } + break ; + + case GB_MAX_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "true" ; break ; + case GB_INT8_code : f = "INT8_MAX" ; break ; + case GB_INT16_code : f = "INT16_MAX" ; break ; + case GB_INT32_code : f = "INT32_MAX" ; break ; + case GB_INT64_code : f = "INT64_MAX" ; break ; + case GB_UINT8_code : f = "UINT8_MAX" ; break ; + case GB_UINT16_code : f = "UINT16_MAX" ; break ; + case GB_UINT32_code : f = "UINT32_MAX" ; break ; + case GB_UINT64_code : f = "UINT64_MAX" ; break ; + default : f = "INFINITY" ; break ; + } + break ; + + case GB_PLUS_opcode : + + if (zcode == GB_BOOL_code) + { + f = "true" ; // boolean OR + is_terminal = true ; + } + else + { + f = NULL ; + is_terminal = false ; + } + break ; + + case GB_TIMES_opcode : + + switch (zcode) + { + case GB_BOOL_code : // boolean AND + case GB_INT8_code : + case GB_INT16_code : + case GB_INT32_code : + case GB_INT64_code : + case GB_UINT8_code : + case GB_UINT16_code : + case GB_UINT32_code : + case GB_UINT64_code : + f = "0" ; + is_terminal = true ; + break ; + default : + f = NULL ; + is_terminal = false ; + break ; + } + break ; + + case GB_LOR_opcode : f = "true" ; is_terminal = true ; break ; + case GB_LAND_opcode : f = "false" ; is_terminal = true ; break ; + + case GB_LXOR_opcode : + // case GB_LXNOR_opcode : + case GB_EQ_opcode : + default : + // the monoid is not terminal + f = NULL ; + is_terminal = false ; + break ; + } + + //-------------------------------------------------------------------------- + // construct the macro to test the terminal condition + //-------------------------------------------------------------------------- + + if (is_terminal) + { + // the monoid is terminal + if (f == NULL) + { + // ANY monoid + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) true", macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s break", macro_statement_name) ; + } + else + { + // typical terminal monoids: check if C(i,j) has reached its + // terminal value + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) ((cij) == %s)", macro_condition_name, f) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s if (%s (cij)) break", + macro_statement_name, macro_condition_name) ; + } + } + else + { + // the monoid is not terminal: the condition is always false + snprintf (terminal_condition, GB_CUDA_STRLEN, "#define %s(cij) false", + macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, "#define %s", + macro_statement_name) ; + } + + (*is_monoid_terminal) = is_terminal ; + } + + + 
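For example, with the macro names that stringify_semiring passes in below ("GB_TERMINAL_CONDITION" and "GB_IF_TERMINAL_BREAK"), the MAX monoid over int8_t is terminal at INT8_MAX and the two snprintf calls above produce

    #define GB_TERMINAL_CONDITION(cij) ((cij) == INT8_MAX)
    #define GB_IF_TERMINAL_BREAK if (GB_TERMINAL_CONDITION (cij)) break

while a non-terminal monoid such as PLUS over double produces

    #define GB_TERMINAL_CONDITION(cij) false
    #define GB_IF_TERMINAL_BREAK

the intent being that a generated kernel can write GB_IF_TERMINAL_BREAK unconditionally inside its reduction loop and let the preprocessor decide whether an early exit is possible.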
//--------------------------------------------------------------------------
+ // Handle mask type and structural vs not
+ //--------------------------------------------------------------------------
+ const char *stringify_mask
+ (
+     const GB_Type_code M_type_code,
+     bool mask_is_structural
+ )
+ {
+
+     if (mask_is_structural)
+     {
+         return (
+             "#define GB_MTYPE void\n"
+             "#define MX(i) true") ;
+     }
+     else
+     {
+         switch (M_type_code)
+         {
+             case GB_BOOL_code:
+             case GB_INT8_code:
+             case GB_UINT8_code:
+                 return (
+                     "#define GB_MTYPE uint8_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             case GB_INT16_code:
+             case GB_UINT16_code:
+                 return (
+                     "#define GB_MTYPE uint16_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             case GB_INT32_code:
+             case GB_UINT32_code:
+             // case GB_FC32_code:
+             case GB_FP32_code:
+                 return (
+                     "#define GB_MTYPE uint32_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             case GB_INT64_code:
+             case GB_UINT64_code:
+             // case GB_FC64_code:
+             case GB_FP64_code:
+                 return (
+                     "#define GB_MTYPE uint64_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             // case GB_FC64_code:
+             //     return (
+             //         "#define GB_MTYPE double complex\n"
+             //         "#define MX(i) Mx [i]") ;
+
+             default: ;
+         }
+     }
+
+     // unrecognized type
+     return (NULL) ;
+ }
+
+// Construct a macro to load and typecast.  For example:
+//
+//  #define GB_GETA(blob) blob
+//
+// then use as:
+//      GB_GETA (double aij = Ax [p]) ;
+//      GB_GETA (double *Ax = A->x) ;
+//      GB_GETA (T_A *restrict Ax = A->x) ;
+//
+// which become
+//      double aij = Ax [p] ;
+//      double *Ax = A->x ;
+//      T_A *Ax = A->x ;
+//
+// or, if is_pattern is true, the macro becomes the empty string.
+
+ void stringify_load
+ (
+     // output:
+     char *result,
+     // input:
+     const char *macro_name,     // name of macro to construct
+     bool is_pattern             // if true, load/cast does nothing
+ )
+ {
+
+     if (is_pattern)
+     {
+         snprintf (result, GB_CUDA_STRLEN, "#define %s(blob)", macro_name) ;
+     }
+     else
+     {
+         snprintf (result, GB_CUDA_STRLEN, "#define %s(blob) blob", macro_name) ;
+     }
+ }
+
+ // Construct a string defining a semiring.
+ // User-defined types are not handled.
+ // build a semiring (name and code)
+ void stringify_semiring
+ (
+     // input:
+     GrB_Semiring semiring,      // the semiring to stringify
+     bool flipxy,                // multiplier is: mult(a,b) or mult(b,a)
+     GrB_Type ctype,             // the type of C
+     GrB_Type atype,             // the type of A
+     GrB_Type btype,             // the type of B
+     GrB_Type mtype,             // the type of M, or NULL if no mask
+     bool Mask_struct,           // mask is structural
+     bool mask_in_semiring_name, // if true, then the semiring_name includes
+                                 // the mask_name.
If false, then semiring_name + // is independent of the mask_name + // output: (all of size at least GB_CUDA_LEN+1) + char *semiring_name, // name of the semiring + char *semiring_code, // List of types and macro defs + char *mask_name // definition of mask data load + ) + { + + // check inputs + ASSERT (semiring->object_kind == GB_BUILTIN) ; + + // get the semiring + GrB_Monoid add = semiring->add ; + GrB_BinaryOp mult = semiring->multiply ; + GrB_BinaryOp addop = add->op ; + GrB_Type xtype = mult->xtype ; + GrB_Type ytype = mult->ytype ; + GrB_Type ztype = mult->ztype ; + GB_Opcode mult_opcode = mult->opcode ; + GB_Opcode add_opcode = addop->opcode ; + GB_Type_code xcode = xtype->code ; + GB_Type_code ycode = ytype->code ; + GB_Type_code zcode = ztype->code ; + + // these must always be true for any semiring: + ASSERT (mult->ztype == addop->ztype) ; + ASSERT (addop->xtype == addop->ztype && addop->ytype == addop->ztype) ; + + // for now, this is true for all built-in binops: + ASSERT (xcode == ycode) ; + + //-------------------------------------------------------------------------- + // rename redundant boolean operators + //-------------------------------------------------------------------------- + + // consider z = op(x,y) where both x and y are boolean: + // DIV becomes FIRST + // RDIV becomes SECOND + // MIN and TIMES become LAND + // MAX and PLUS become LOR + // NE, ISNE, RMINUS, and MINUS become LXOR + // ISEQ becomes EQ + // ISGT becomes GT + // ISLT becomes LT + // ISGE becomes GE + // ISLE becomes LE + + if (zcode == GB_BOOL_code) + { + // rename the monoid + add_opcode = GB_boolean_rename (add_opcode) ; + } + + if (xcode == GB_BOOL_code) // && (ycode == GB_BOOL_code) + { + // rename the multiplicative operator + mult_opcode = GB_boolean_rename (mult_opcode) ; + } + + //-------------------------------------------------------------------------- + // handle the flip + //-------------------------------------------------------------------------- + + if (flipxy) + { + // z = fmult (b,a) will be computed: handle this by renaming the + // multiplicative operator + + // handle the flip + mult_opcode = GB_binop_flip (mult_opcode) ; + + // the flip is now handled completely. This assumes xtype and ytype + // are the same for all built-in operators. If this changes, the + // types will have to be flipped too. 
+ flipxy = false ; + } + + //-------------------------------------------------------------------------- + // determine if A and/or B are value-agnostic + //-------------------------------------------------------------------------- + + bool op_is_first = (mult_opcode == GB_FIRST_opcode ) ; + bool op_is_second = (mult_opcode == GB_SECOND_opcode) ; + bool op_is_pair = false ; // (mult_opcode == GB_PAIR_opcode) ; + bool A_is_pattern = op_is_second || op_is_pair ; + bool B_is_pattern = op_is_first || op_is_pair ; + + //-------------------------------------------------------------------------- + // construct macros to load scalars from A and B (and typecast) them + //-------------------------------------------------------------------------- + + char acast [GB_CUDA_STRLEN+1] ; + char bcast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (acast, "GB_GETA", A_is_pattern) ; + GB_cuda_stringify_load (bcast, "GB_GETB", B_is_pattern) ; + + //-------------------------------------------------------------------------- + // construct macros for the multiply + //-------------------------------------------------------------------------- + + char mult_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (mult_function, "GB_MULT", mult_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // construct the monoid macros + //-------------------------------------------------------------------------- + + char add_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (add_function, "GB_ADD", add_opcode, zcode) ; + + char identity_definition [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_identity ( identity_definition, add_opcode, zcode) ; + + bool is_terminal ; + char terminal_condition [GB_CUDA_STRLEN+1] ; + char terminal_statement [GB_CUDA_STRLEN+1] ; + + GB_cuda_stringify_terminal ( + &is_terminal, terminal_condition, terminal_statement, + "GB_TERMINAL_CONDITION", "GB_IF_TERMINAL_BREAK", add_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // macro to typecast the result back into C + //-------------------------------------------------------------------------- + + // for the ANY_PAIR semiring, "c_is_one" will be true, and Cx [0..cnz] will + // be filled with all 1's later. 
+ bool c_is_one = false ; + // TODO: + // (add_opcode == GB_ANY_opcode && mult_opcode == GB_PAIR_opcode) ; + char ccast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (ccast, "GB_PUTC", c_is_one) ; + + //-------------------------------------------------------------------------- + // construct the macros to access the mask (if any), and its name + //-------------------------------------------------------------------------- + + const char *mask_string = "" ; + const char *mask_type_name = "" ; + const char *struct_str = "struct"; + if (mtype != NULL) + { + mask_string = GB_cuda_stringify_mask (mtype->code, Mask_struct) ; + mask_type_name = mtype->name ; + } + else + { + mask_type_name = struct_str; + } + + snprintf (mask_name, GB_CUDA_STRLEN, "mask_%s", mask_type_name) ; + + //-------------------------------------------------------------------------- + // build the final semiring code + //-------------------------------------------------------------------------- + + snprintf (semiring_code, GB_CUDA_STRLEN, + "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", + acast, bcast, mult_function, add_function, identity_definition, + terminal_condition, terminal_statement, ccast, mask_string) ; + + //-------------------------------------------------------------------------- + // build the final semiring name + //-------------------------------------------------------------------------- + + // the semiring_name depends on: + // add_opcode + // mult_opcode + // ztype->name + // xtype->name (currently, always == ytype->name, but will change (TODO)) + // ytype->name + // ctype->name + // mask_type_name (but only if mask_in_semiring_name is true) + // atype->name + // btype->name + + const char *add_name; + const char *mult_name; + + add_name = GB_cuda_stringify_opcode (add_opcode) ; + mult_name = GB_cuda_stringify_opcode (mult_opcode) ; + + // these are not needed: they are template parameters to the CUDA kernel: + // ztype->name, xtype->name, ytype->name, + // ctype->name, atype->name, btype->name + + // ztype->name is required, since the kernel needs it for the identity + // value. xtype->name is not strictly required. However, the GraphBLAS + // naming scheme is add_mult_xtype, so it is included here. The ytype + // and ztype need not be xtype. + + if (mask_in_semiring_name) + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_M_mtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_M_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // these are not in the GrB* or GxB* name, but are needed by CUDA: + // mask_type_name is (say) 'int64' or 'bool'. + // ztype is the name of the monoid type. + mask_type_name, ztype->name) ; + + } + else + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // this is not in the GrB* or GxB* name, but is needed by CUDA: + // ztype is the name of the monoid type. 
+ ztype->name) ; + + } + + printf ("semiring_name:\n%s\n", semiring_name) ; + //printf ("semiring_code:\n%s\n", semiring_code) ; + //printf ("mask_name: \n%s\n", mask_name) ; + } + + +}; + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify.h b/GraphBLAS/CUDA/GB_cuda_stringify.h new file mode 100644 index 0000000000..18cc7464f1 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify.h @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +//------------------------------------------------------------------------------ +// GB_cuda_stringify.h: prototype definitions for using C helpers +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +// This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files. + +#ifndef GB_CUDA_STRINGIFY_H +#define GB_CUDA_STRINGIFY_H + +// length of strings for building semiring code and names +#define GB_CUDA_STRLEN 2048 + +void GB_cuda_stringify_terminal // return strings to check terminal +( + // outputs: + bool *is_monoid_terminal, + char *terminal_condition, + char *terminal_statement, + // inputs: + const char *macro_condition_name, + const char *macro_statement_name, + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // op->ztype->code +) ; + +const char *GB_cuda_stringify_mask +( + const GB_Type_code M_type_code, + bool mask_is_structural +) ; + +void GB_cuda_stringify_semiring // build a semiring (name and code) +( + // input: + GrB_Semiring semiring, // the semiring to stringify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + GrB_Type ctype, // the type of C + GrB_Type atype, // the type of A + GrB_Type btype, // the type of B + GrB_Type mtype, // the type of M, or NULL if no mask + bool Mask_struct, // mask is structural + bool mask_in_semiring_name, // if true, then the semiring_name includes + // the mask_name. 
If false, then semiring_name + // is independent of the mask_name + // output: (all of size at least GB_CUDA_LEN+1) + char *semiring_name, // name of the semiring + char *semiring_code, // List of types and macro defs + char *mask_name // definition of mask data load +) ; + +void GB_cuda_stringify_binop +( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator +) ; + +void GB_cuda_stringify_load // return a string to load/typecast macro +( + // output: + char *result, + // input: + const char *macro_name, // name of macro to construct + bool is_pattern // if true, load/cast does nothing +) ; + +void GB_cuda_stringify_identity // return string for identity value +( + // output: + char *code_string, // string with the #define macro + // input: + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode +) ; + +const char *GB_cuda_stringify_opcode +( + GB_Opcode opcode // opcode of GraphBLAS operator +) ; + +GB_Opcode GB_binop_flip // flipped opcode, or same opcode if not flipped +( + GB_Opcode opcode // opcode to flip +) ; + +#endif diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_binop.c b/GraphBLAS/CUDA/GB_cuda_stringify_binop.c new file mode 100644 index 0000000000..c4e42a6379 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_binop.c @@ -0,0 +1,647 @@ +//SPDX-License-Identifier: Apache-2.0 + +#include "GB.h" +#include "GB_cuda_stringify.h" + +// The binop macro generates an expression, not a full statement (there +// is no semicolon). + +// for example: +// #define GB_MULT(x,y) ((x) * (y)) + +void GB_cuda_stringify_binop +( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator +) +{ + const char *op_string ; + int ecode ; + GB_cuda_enumify_binop (&ecode, opcode, zcode) ; + GB_cuda_charify_binop (&op_string, ecode, for_semiring) ; + GB_cuda_macrofy_binop (code_string, macro_name, op_string) ; +} + +void GB_cuda_enumify_binop +( + // output: + int *ecode, // enumerated operator, in range 0 to ... 
(-1 on failure) + // input: + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator +// bool for_semiring // true for A*B, false for A+B or A.*B (not needed) +) +{ + int e = -1 ; + + switch (opcode) + { + + case GB_FIRST_opcode : // z = x + + e = 0 ; // "(x)" ; + break ; + + case GB_ANY_opcode : + case GB_SECOND_opcode : // z = y + + e = 1 ; // "(y)" ; + break ; + + case GB_MIN_opcode : // z = min(x,y) + + switch (zcode) + { + case GB_FP32_code : e = 2 ; break ; // "fminf (x,y)" ; + case GB_FP64_code : e = 3 ; break ; // "fmin (x,y)" ; + default : e = 4 ; break ; // "GB_IMIN (x,y)" ; + } + break ; + + case GB_MAX_opcode : // z = max(x,y) + + switch (zcode) + { + case GB_FP32_code : e = 5 ; break ; // "fmaxf (x,y)" ; + case GB_FP64_code : e = 6 ; break ; // "fmax (x,y)" ; + default : e = 7 ; break ; // "GB_IMAX (x,y)" ; + } + break ; + + case GB_PLUS_opcode : // z = x + y + + e = 8 ; break ; // "(x) + (y)" ; + + case GB_MINUS_opcode : // z = x - y + + e = 9 ; break ; // "(x) - (y)" ; + + case GB_RMINUS_opcode : // z = y - x + + e = 10 ; break ; // "(y) - (x)" ; + + case GB_TIMES_opcode : // z = x * y + + e = 11 ; break ; // "(x) * (y)" ; + + case GB_DIV_opcode : // z = x / y ; + + switch (zcode) + { + case GB_INT8_code : e = 12 ; break ; // "GB_IDIV_SIGNED(x,y,8)" + case GB_INT16_code : e = 13 ; break ; // "GB_IDIV_SIGNED(x,y,16)" + case GB_INT32_code : e = 14 ; break ; // "GB_IDIV_SIGNED(x,y,32)" + case GB_INT64_code : e = 15 ; break ; // "GB_IDIV_SIGNED(x,y,64)" + case GB_UINT8_code : e = 16 ; break ; // "GB_IDIV_UNSIGNED(x,y,8)" + case GB_UINT16_code : e = 17 ; break ; // "GB_IDIV_UNSIGNED(x,y,16)" + case GB_UINT32_code : e = 18 ; break ; // "GB_IDIV_UNSIGNED(x,y,32)" + case GB_UINT64_code : e = 19 ; break ; // "GB_IDIV_UNSIGNED(x,y,64)" + default : e = 20 ; break ; // "(x) / (y)" + } + break ; + + case GB_RDIV_opcode : // z = y / x ; + + switch (zcode) + { + case GB_INT8_code : e = 21 ; break ; // GB_IDIV_SIGNED(y,x,8) + case GB_INT16_code : e = 22 ; break ; // GB_IDIV_SIGNED(y,x,16)" + case GB_INT32_code : e = 23 ; break ; // GB_IDIV_SIGNED(y,x,32)" + case GB_INT64_code : e = 24 ; break ; // GB_IDIV_SIGNED(y,x,64)" + case GB_UINT8_code : e = 25 ; break ; // GB_IDIV_UNSIGNED(y,x,8)" + case GB_UINT16_code : e = 26 ; break ; // GB_IDIV_UNSIGNED(y,x,16)" + case GB_UINT32_code : e = 27 ; break ; // GB_IDIV_UNSIGNED(y,x,32)" + case GB_UINT64_code : e = 28 ; break ; // GB_IDIV_UNSIGNED(y,x,64)" + default : e = 29 ; break ; // (y) / (x) + } + break ; + + case GB_EQ_opcode : + case GB_ISEQ_opcode : // z = (x == y) + + e = 30 ; // "(x) == (y)" ; + break ; + + case GB_NE_opcode : + case GB_ISNE_opcode : // z = (x != y) + + e = 31 ; // "(x) != (y)" ; + break ; + + case GB_GT_opcode : + case GB_ISGT_opcode : // z = (x > y) + + e = 32 ; // "(x) > (y)" ; + break ; + + case GB_LT_opcode : + case GB_ISLT_opcode : // z = (x < y) + + e = 33 ; // "(x) < (y)" ; + break ; + + case GB_GE_opcode : + case GB_ISGE_opcode : // z = (x >= y) + + e = 34 ; // "(x) >= (y)" ; + break ; + + case GB_LE_opcode : + case GB_ISLE_opcode : // z = (x <= y) + + e = 35 ; // "(x) <= (y)" ; + break ; + + case GB_LOR_opcode : // z = (x != 0) || (y != 0) + + switch (zcode) + { + case GB_BOOL_code : e = 36 ; break ; // "(x) || (y)" + default : e = 37 ; break ; // "((x)!=0) || ((y)!=0)" + } + break ; + + case GB_LAND_opcode : // z = (x != 0) && (y != 0) + + switch (zcode) + { + case GB_BOOL_code : e = 38 ; break ; // "(x) && (y)" + default : e = 39 ; break ; 
// "((x)!=0) && ((y)!=0)" + } + break ; + + case GB_LXOR_opcode : // z = (x != 0) != (y != 0) + + switch (zcode) + { + case GB_BOOL_code : e = 40 ; break ; // "(x) != (y)" + default : e = 41 ; break ; // "((x)!=0) != ((y)!=0)" + } + break ; + + case GB_BOR_opcode : // z = (x | y), bitwise or + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 42 ; // "(x) | (y)" + } + break ; + + case GB_BAND_opcode : // z = (x & y), bitwise and + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 43 ; // "(x) & (y)" + } + break ; + + case GB_BXOR_opcode : // z = (x ^ y), bitwise xor + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 44 ; // "(x) ^ (y)" + } + break ; + + case GB_BXNOR_opcode : // z = ~(x ^ y), bitwise xnor + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 45 ; // "~((x) ^ (y))" + } + break ; + + case GB_BGET_opcode : // z = bitget (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 46 ; break ; // GB_BITGET(x,y,int8_t, 8) + case GB_INT16_code : e = 47 ; break ; // GB_BITGET(x,y,int16_t,16) + case GB_INT32_code : e = 48 ; break ; // GB_BITGET(x,y,int32_t,32) + case GB_INT64_code : e = 49 ; break ; // GB_BITGET(x,y,int64_t,64) + case GB_UINT8_code : e = 50 ; break ; // GB_BITGET(x,y,uint8_t,8) + case GB_UINT16_code : e = 51 ; break ; // GB_BITGET(x,y,uint16_t,16) + case GB_UINT32_code : e = 52 ; break ; // GB_BITGET(x,y,uint32_t,32) + case GB_UINT64_code : e = 53 ; break ; // GB_BITGET(x,y,uint64_t,64) + default : e = -1 ; break ; + } + break ; + + case GB_BSET_opcode : // z = bitset (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 54 ; break ; // GB_BITSET(x,y,int8_t, 8) + case GB_INT16_code : e = 55 ; break ; // GB_BITSET(x,y,int16_t,16) + case GB_INT32_code : e = 56 ; break ; // GB_BITSET(x,y,int32_t,32) + case GB_INT64_code : e = 57 ; break ; // GB_BITSET(x,y,int64_t,64) + case GB_UINT8_code : e = 58 ; break ; // GB_BITSET(x,y,uint8_t,8) + case GB_UINT16_code : e = 59 ; break ; // GB_BITSET(x,y,uint16_t,16) + case GB_UINT32_code : e = 60 ; break ; // GB_BITSET(x,y,uint32_t,32) + case GB_UINT64_code : e = 61 ; break ; // GB_BITSET(x,y,uint64_t,64) + default : e = -1 ; break ; + } + break ; + + case GB_BCLR_opcode : // z = bitclr (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 62 ; break ; // GB_BITCLR(x,y,int8_t, 8) + case GB_INT16_code : e = 63 ; break ; // GB_BITCLR(x,y,int16_t,16) + case GB_INT32_code : e = 64 ; break ; // GB_BITCLR(x,y,int32_t,32) + case GB_INT64_code : e = 65 ; break ; // GB_BITCLR(x,y,int64_t,64) + case GB_UINT8_code : e = 66 ; break ; // GB_BITCLR(x,y,uint8_t,8) + case GB_UINT16_code : e = 67 ; break ; // GB_BITCLR(x,y,uint16_t,16) + case GB_UINT32_code : e = 68 ; break ; // GB_BITCLR(x,y,uint32_t,32) + case GB_UINT64_code : e = 69 ; break ; // GB_BITCLR(x,y,uint64_t,64) + default : e = -1 ; break ; + } + break ; + + case GB_BSHIFT_opcode : // z = bitshift (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 70 ; break ; // GB_bitshift_int8(x,y) + case GB_INT16_code : e = 71 ; break ; // GB_bitshift_int16(x,y) + case GB_INT32_code : e = 72 ; break ; // GB_bitshift_int32(x,y) + case GB_INT64_code : e = 73 ; break ; // GB_bitshift_int64(x,y) + case GB_UINT8_code : e = 74 ; break ; // GB_bitshift_uint8(x,y) + case GB_UINT16_code : e = 75 ; break ; // GB_bitshift_uint16(x,y) + case GB_UINT32_code : e = 76 ; break ; // GB_bitshift_uint32(x,y) + case GB_UINT64_code : e = 77 ; break ; // GB_bitshift_uint64(x,y) + default : e = -1 ; break ; + } + break ; + + case GB_POW_opcode : // z = pow 
(x,y) + + switch (zcode) + { + case GB_INT8_code : e = 78 ; break ; // GB_pow_int8 (x, y) + case GB_INT16_code : e = 79 ; break ; // GB_pow_int16 (x, y) + case GB_INT32_code : e = 80 ; break ; // GB_pow_int32 (x, y) + case GB_INT64_code : e = 81 ; break ; // GB_pow_int64 (x, y) + case GB_UINT8_code : e = 82 ; break ; // GB_pow_uint8 (x, y) + case GB_UINT16_code : e = 83 ; break ; // GB_pow_uint16 (x, y) + case GB_UINT32_code : e = 84 ; break ; // GB_pow_uint32 (x, y) + case GB_UINT64_code : e = 85 ; break ; // GB_pow_uint64 (x, y) + case GB_FP32_code : e = 86 ; break ; // GB_powf (x, y) + case GB_FP64_code : e = 87 ; break ; // GB_pow (x, y) + case GB_FC32_code : e = 88 ; break ; // GB_cpowf (x, y) + case GB_FC64_code : e = 89 ; break ; // GB_cpow (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_ATAN2_opcode : // z = atan2 (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 90 ; break ; // atan2f (x, y) + case GB_FP64_code : e = 91 ; break ; // atan2 (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_HYPOT_opcode : // z = hypot (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 92 ; break ; // hypotf (x, y) + case GB_FP64_code : e = 93 ; break ; // hypot (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_FMOD_opcode : // z = fmod (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 94 ; break ; // fmodf (x, y) + case GB_FP64_code : e = 95 ; break ; // fmod (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_REMAINDER_opcode : // z = remainder (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 96 ; break ; // remainderf (x, y) + case GB_FP64_code : e = 97 ; break ; // remainder (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_COPYSIGN_opcode : // z = copysign (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 98 ; break ; // copysignf (x, y) + case GB_FP64_code : e = 99 ; break ; // copysign (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_LDEXP_opcode : // z = ldexp (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 100 ; break ; // ldexpf (x, y) + case GB_FP64_code : e = 101 ; break ; // ldexp (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_CMPLX_opcode : // z = cmplx (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 102 ; break ; // GxB_CMPLXF (x, y) + case GB_FP64_code : e = 103 ; break ; // GxB_CMPLX (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_PAIR_opcode : // z = 1 + + e = 104 ; break ; // 1 + + case GB_FIRSTI_opcode : // z = first_i(A(i,j),y) == i + + e = 105 ; break ; // z = i + + case GB_FIRSTI1_opcode : // z = first_i1(A(i,j),y) == i+1 + + e = 106 ; break ; // z = i+1 + + case GB_FIRSTJ_opcode : // z = first_j(A(i,j),y) == j + + e = 107 ; break ; // z = for_semiring ? (k) : (j) + + case GB_FIRSTJ1_opcode : // z = first_j1(A(i,j),y) == j+1 + + e = 108 ; break ; // z = for_semiring ? (k+1) : (j+1) + + case GB_SECONDI_opcode : // z = second_i(x,B(i,j)) == i + + e = 109 ; break ; // z = for_semiring ? (k) : (i) + + case GB_SECONDI1_opcode : // z = second_i1(x,B(i,j)) == i+1 + + e = 110 ; break ; // z = for_semiring ? 
(k) : () + + case GB_SECONDJ_opcode : // z = second_j(x,B(i,j)) == j + + e = 111 ; break ; // z = j + + case GB_SECONDJ1_opcode : // z = second_j1(x,B(i,j)) == j+1 + + e = 112 ; break ; // z = j+1 + + default : break ; + } + + (*ecode) = e ; +} + +void GB_cuda_charify_binop +( + // output: + char **op_string, // string defining the operator + // input: + int ecode, // from GB_cuda_enumify_binop + bool for_semiring // true for A*B, false for A+B or A.*B (not needed) +) +{ + const char *f ; + + switch (ecode) + { + + // first + case 0 : f = "(x)" ; break ; + + // any, second + case 1 : f = "(y)" ; break ; + + // min + case 2 : f = "fminf (x,y)" ; break ; + case 3 : f = "fmin (x,y)" ; break ; + case 4 : f = "GB_IMIN (x,y)" ; break ; + + // max + case 5 : f = "fmaxf (x,y)" ; break ; + case 6 : f = "fmax (x,y)" ; break ; + case 7 : f = "GB_IMAX (x,y)" ; break ; + + // plus + case 8 : f = "(x) + (y)" ; break ; + + // minus + case 9 : f = "(x) - (y)" ; break ; + + // rminus + case 10 : f = "(y) - (x)" ; break ; + + // times + case 11 : f = "(x) * (y)" ; break ; + + // div + case 12 : f = "GB_IDIV_SIGNED(x,y,8)" ; break ; + case 13 : f = "GB_IDIV_SIGNED(x,y,16)" ; break ; + case 14 : f = "GB_IDIV_SIGNED(x,y,32)" ; break ; + case 15 : f = "GB_IDIV_SIGNED(x,y,64)" ; break ; + case 16 : f = "GB_IDIV_UNSIGNED(x,y,8)" ; break ; + case 17 : f = "GB_IDIV_UNSIGNED(x,y,16)" ; break ; + case 18 : f = "GB_IDIV_UNSIGNED(x,y,32)" ; break ; + case 19 : f = "GB_IDIV_UNSIGNED(x,y,64)" ; break ; + case 20 : f = "(x) / (y)" ; break ; + + // rdiv + case 21 : f = "GB_IDIV_SIGNED(y,x,8)" ; break ; + case 22 : f = "GB_IDIV_SIGNED(y,x,16)" ; break ; + case 23 : f = "GB_IDIV_SIGNED(y,x,32)" ; break ; + case 24 : f = "GB_IDIV_SIGNED(y,x,64)" ; break ; + case 25 : f = "GB_IDIV_UNSIGNED(y,x,8)" ; break ; + case 26 : f = "GB_IDIV_UNSIGNED(y,x,16)" ; break ; + case 27 : f = "GB_IDIV_UNSIGNED(y,x,32)" ; break ; + case 28 : f = "GB_IDIV_UNSIGNED(y,x,64)" ; break ; + case 29 : f = "(y) / (x)" ; break ; + + // eq, iseq + case 30 : f = "(x) == (y)" ; break ; + + // ne, isne + case 31 : f = "(x) != (y)" ; break ; + + // gt, isgt + case 32 : f = "(x) > (y)" ; break ; + + // lt, islt + case 33 : f = "(x) < (y)" ; break ; + + // ge, isget + case 34 : f = "(x) >= (y)" ; break ; + + // le, isle + case 35 : f = "(x) <= (y)" ; break ; + + // lor + case 36 : f = "(x) || (y)" ; break ; + case 37 : f = "((x)!=0) || ((y)!=0)" ; break ; + + // land + case 38 : f = "(x) && (y)" ; break ; + case 39 : f = "((x)!=0) && ((y)!=0)" ; break ; + + // lxor + case 40 : f = "(x) != (y)" ; break ; + case 41 : f = "((x)!=0) != ((y)!=0)" ; break ; + + // bor + case 42 : f = "(x) | (y)" ; break ; + + // band + case 43 : f = "(x) & (y)" ; break ; + + // bxor + case 44 : f = "(x) ^ (y)" ; break ; + + // bxnor + case 45 : f = "~((x) ^ (y))" ; break ; + + // bget + case 46 : f = "GB_BITGET(x,y,int8_t, 8)" ; break ; + case 47 : f = "GB_BITGET(x,y,int16_t,16)" ; break ; + case 48 : f = "GB_BITGET(x,y,int32_t,32)" ; break ; + case 49 : f = "GB_BITGET(x,y,int64_t,64)" ; break ; + case 50 : f = "GB_BITGET(x,y,uint8_t,8)" ; break ; + case 51 : f = "GB_BITGET(x,y,uint16_t,16)" ; break ; + case 52 : f = "GB_BITGET(x,y,uint32_t,32)" ; break ; + case 53 : f = "GB_BITGET(x,y,uint64_t,64)" ; break ; + + // bset + case 54 : f = "GB_BITSET(x,y,int8_t, 8)" ; break ; + case 55 : f = "GB_BITSET(x,y,int16_t,16)" ; break ; + case 56 : f = "GB_BITSET(x,y,int32_t,32)" ; break ; + case 57 : f = "GB_BITSET(x,y,int64_t,64)" ; break ; + case 58 : f = "GB_BITSET(x,y,uint8_t,8)" ; break ; 
+        case 59 : f = "GB_BITSET(x,y,uint16_t,16)" ; break ;
+        case 60 : f = "GB_BITSET(x,y,uint32_t,32)" ; break ;
+        case 61 : f = "GB_BITSET(x,y,uint64_t,64)" ; break ;
+
+        // bclr
+        case 62 : f = "GB_BITCLR(x,y,int8_t, 8)" ; break ;
+        case 63 : f = "GB_BITCLR(x,y,int16_t,16)" ; break ;
+        case 64 : f = "GB_BITCLR(x,y,int32_t,32)" ; break ;
+        case 65 : f = "GB_BITCLR(x,y,int64_t,64)" ; break ;
+        case 66 : f = "GB_BITCLR(x,y,uint8_t,8)" ; break ;
+        case 67 : f = "GB_BITCLR(x,y,uint16_t,16)" ; break ;
+        case 68 : f = "GB_BITCLR(x,y,uint32_t,32)" ; break ;
+        case 69 : f = "GB_BITCLR(x,y,uint64_t,64)" ; break ;
+
+        // bshift
+        case 70 : f = "GB_bitshift_int8(x,y)" ; break ;
+        case 71 : f = "GB_bitshift_int16(x,y)" ; break ;
+        case 72 : f = "GB_bitshift_int32(x,y)" ; break ;
+        case 73 : f = "GB_bitshift_int64(x,y)" ; break ;
+        case 74 : f = "GB_bitshift_uint8(x,y)" ; break ;
+        case 75 : f = "GB_bitshift_uint16(x,y)" ; break ;
+        case 76 : f = "GB_bitshift_uint32(x,y)" ; break ;
+        case 77 : f = "GB_bitshift_uint64(x,y)" ; break ;
+
+        // pow
+        case 78 : f = "GB_pow_int8 (x, y)" ; break ;
+        case 79 : f = "GB_pow_int16 (x, y)" ; break ;
+        case 80 : f = "GB_pow_int32 (x, y)" ; break ;
+        case 81 : f = "GB_pow_int64 (x, y)" ; break ;
+        case 82 : f = "GB_pow_uint8 (x, y)" ; break ;
+        case 83 : f = "GB_pow_uint16 (x, y)" ; break ;
+        case 84 : f = "GB_pow_uint32 (x, y)" ; break ;
+        case 85 : f = "GB_pow_uint64 (x, y)" ; break ;
+        case 86 : f = "GB_powf (x, y)" ; break ;
+        case 87 : f = "GB_pow (x, y)" ; break ;
+        case 88 : f = "GB_cpowf (x, y)" ; break ;
+        case 89 : f = "GB_cpow (x, y)" ; break ;
+
+        // atan2
+        case 90 : f = "atan2f (x, y)" ; break ;
+        case 91 : f = "atan2 (x, y)" ; break ;
+
+        // hypot
+        case 92 : f = "hypotf (x, y)" ; break ;
+        case 93 : f = "hypot (x, y)" ; break ;
+
+        // fmod
+        case 94 : f = "fmodf (x, y)" ; break ;
+        case 95 : f = "fmod (x, y)" ; break ;
+
+        // remainder
+        case 96 : f = "remainderf (x, y)" ; break ;
+        case 97 : f = "remainder (x, y)" ; break ;
+
+        // copysign
+        case 98 : f = "copysignf (x, y)" ; break ;
+        case 99 : f = "copysign (x, y)" ; break ;
+
+        // ldexp
+        case 100 : f = "ldexpf (x, y)" ; break ;
+        case 101 : f = "ldexp (x, y)" ; break ;
+
+        // cmplx
+        case 102 : f = "GxB_CMPLXF (x, y)" ; break ;
+        case 103 : f = "GxB_CMPLX (x, y)" ; break ;
+
+        // pair
+        case 104 : f = "(1)" ; break ;
+
+        // firsti
+        case 105 : f = "(i)" ; break ;
+
+        // firsti1
+        case 106 : f = "(i+1)" ; break ;
+
+        // firstj
+        case 107 : f = for_semiring ? "(k)" : "(j)" ; break ;
+
+        // firstj1
+        case 108 : f = for_semiring ? "(k+1)" : "(j+1)" ; break ;
+
+        // secondi
+        case 109 : f = for_semiring ? "(k)" : "(i)" ; break ;
+
+        // secondi1
+        case 110 : f = for_semiring ?
"(k+1)" : "(i+1)" ; break ; + + // secondj + case 111 : f = "(j)" ; break ; + + // secondj1 + case 112 : f = "(j+1)" ; break ; + + default : f = NULL ; ; break ; + } + + (*op_string) = f ; +} + +void GB_cuda_macrofy_binop +( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + char *op_string // string defining the operator +) +{ + snprintf (code_string, GB_CUDA_STRLEN, + "#define %s(x,y) (%s)", macro_name, op_string) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_identity.c b/GraphBLAS/CUDA/GB_cuda_stringify_identity.c new file mode 100644 index 0000000000..c969c591fd --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_identity.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: Apache-2.0 +//------------------------------------------------------------------------------ +// GB_cuda_stringify_identity: return string for identity value +//------------------------------------------------------------------------------ + +#include "GB.h" +#include "GB_cuda_stringify.h" + +#define ID( x) IDENT = (x) + +void GB_cuda_stringify_identity // return string for identity value +( + // output: + char *code_string, // string with the #define macro + // input: + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // type code used in the opcode we want +) +{ + const char *IDENT; + switch (opcode) + { + case GB_MIN_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("true") ; // boolean AND + case GB_INT8_code : ID ("INT8_MAX") ; + case GB_INT16_code : ID ("INT16_MAX") ; + case GB_INT32_code : ID ("INT32_MAX") ; + case GB_INT64_code : ID ("INT64_MAX") ; + case GB_UINT8_code : ID ("UINT8_MAX") ; + case GB_UINT16_code : ID ("UINT16_MAX") ; + case GB_UINT32_code : ID ("UINT32_MAX") ; + case GB_UINT64_code : ID ("UINT64_MAX") ; + default : ID ("INFINITY") ; + } + break ; + + case GB_MAX_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("false") ; // boolean OR + case GB_INT8_code : ID ("INT8_MIN") ; + case GB_INT16_code : ID ("INT16_MIN") ; + case GB_INT32_code : ID ("INT32_MIN") ; + case GB_INT64_code : ID ("INT64_MIN") ; + case GB_UINT8_code : ID ("0") ; + case GB_UINT16_code : ID ("0") ; + case GB_UINT32_code : ID ("0") ; + case GB_UINT64_code : ID ("0") ; + default : ID ("(-INFINITY)") ; + } + break ; + + case GB_PLUS_opcode : ID ("0") ; + case GB_TIMES_opcode : ID ("1") ; + case GB_LOR_opcode : ID ("false") ; + case GB_LAND_opcode : ID ("true") ; + case GB_LXOR_opcode : ID ("false") ; + // case GB_LXNOR_opcode : + case GB_EQ_opcode : ID ("true") ; + // case GB_ANY_opcode : + default : ID ("0") ; + } + snprintf (code_string, GB_CUDA_STRLEN, "#define GB_IDENTITY (%s)", IDENT) ; + +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_load.c b/GraphBLAS/CUDA/GB_cuda_stringify_load.c new file mode 100644 index 0000000000..cf0117db19 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_load.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Construct a macro to load and typecast. For example: +// +// #define GB_GETA(blob) blob +// +// then use as: +// GB_GETA (double aij = Ax [p]) ; +// GB_GETA (double *Ax = A->x) ; +// GB_GETA (T_A *restrict Ax = A->x) ; +// +// which become +// double aij = Ax [p] ; +// double *Ax = A->x ; +// T_A *Ax = A->x ; +// +// or, if is_pattern is true, the macro becomes the empty string. 
+ +#include "GB.h" +#include "GB_cuda_stringify.h" + +void GB_cuda_stringify_load // return a string to load/typecast macro +( + // output: + char *result, + // input: + const char *macro_name, // name of macro to construct + bool is_pattern // if true, load/cast does nothing +) +{ + + if (is_pattern) + { + snprintf (result, GB_CUDA_STRLEN, "#define %s(blob)", macro_name) ; + } + else + { + snprintf (result, GB_CUDA_STRLEN, "#define %s(blob) blob", macro_name) ; + } +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_mask.c b/GraphBLAS/CUDA/GB_cuda_stringify_mask.c new file mode 100644 index 0000000000..3366328c99 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_mask.c @@ -0,0 +1,65 @@ +//SPDX-License-Identifier: Apache-2.0 + +//#include "GB_cuda.h" +#include "GB.h" +#include "GB_cuda_stringify.h" + +const char *GB_cuda_stringify_mask +( + const GB_Type_code M_type_code, + bool mask_is_structural +) +{ + + if (mask_is_structural) + { + return ( + "#define GB_MTYPE void\n" + "#define MX(i) true") ; + } + else + { + switch (M_type_code) + { + case GB_BOOL_code: + case GB_INT8_code: + case GB_UINT8_code: + return ( + "#define GB_MTYPE uint8_t\n" + "#define MX(i) Mx [i]") ; + + case GB_INT16_code: + case GB_UINT16_code: + return ( + "#define GB_MTYPE uint16_t\n" + "#define MX(i) Mx [i]") ; + + case GB_INT32_code: + case GB_UINT32_code: +// case GB_FC32_code: + case GB_FP32_code: + return ( + "#define GB_MTYPE uint32_t\n" + "#define MX(i) Mx [i]") ; + + case GB_INT64_code: + case GB_UINT64_code: +// case GB_FC64_code: + case GB_FP64_code: + return ( + "#define GB_MTYPE uint64_t\n" + "#define MX(i) Mx [i]") ; + +// case GB_FC64_code: +// return ( +// "#define GB_MTYPE double complex\n" +// "#define MX(i) Mx [i]") ; + + default: ; + } + } + + // unrecognized type + return (NULL) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_opcode.c b/GraphBLAS/CUDA/GB_cuda_stringify_opcode.c new file mode 100644 index 0000000000..c54b0c4c91 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_opcode.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "GB.h" +#include "GB_cuda_stringify.h" + +const char *GB_cuda_stringify_opcode +( + GB_Opcode opcode // opcode of GraphBLAS operator +) +{ + + switch (opcode) + { + case GB_FIRST_opcode : return ("1st") ; + // case GB_ANY_opcode : return ("any") ; + case GB_SECOND_opcode : return ("2nd") ; + case GB_MIN_opcode : return ("min") ; + case GB_MAX_opcode : return ("max") ; + case GB_PLUS_opcode : return ("plus") ; + case GB_MINUS_opcode : return ("minus") ; + case GB_RMINUS_opcode : return ("rminus") ; + case GB_TIMES_opcode : return ("times") ; + case GB_DIV_opcode : return ("div") ; + case GB_RDIV_opcode : return ("rdiv") ; + case GB_EQ_opcode : return ("eq") ; + case GB_ISEQ_opcode : return ("iseq") ; + case GB_NE_opcode : return ("ne") ; + case GB_ISNE_opcode : return ("isne") ; + case GB_GT_opcode : return ("gt") ; + case GB_ISGT_opcode : return ("isgt") ; + case GB_LT_opcode : return ("lt") ; + case GB_ISLT_opcode : return ("islt") ; + case GB_GE_opcode : return ("ge") ; + case GB_ISGE_opcode : return ("isge") ; + case GB_LE_opcode : return ("le") ; + case GB_ISLE_opcode : return ("isle") ; + case GB_LOR_opcode : return ("lor") ; + case GB_LAND_opcode : return ("land") ; + case GB_LXOR_opcode : return ("lxor") ; + // case GB_BOR_opcode : ... 
bitwise ops + // x | y, etc + // case GB_PAIR_opcode : + default : ; + } + + return ("") ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_semiring.c b/GraphBLAS/CUDA/GB_cuda_stringify_semiring.c new file mode 100644 index 0000000000..4dfd292f3a --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_semiring.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Construct a string defining a semiring. +// User-defined types are not handled. + +#include "GB.h" +#include "GB_cuda_stringify.h" + +void GB_cuda_stringify_semiring // build a semiring (name and code) +( + // input: + GrB_Semiring semiring, // the semiring to stringify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + GrB_Type ctype, // the type of C + GrB_Type atype, // the type of A + GrB_Type btype, // the type of B + GrB_Type mtype, // the type of M, or NULL if no mask + bool Mask_struct, // mask is structural + bool mask_in_semiring_name, // if true, then the semiring_name includes + // the mask_name. If false, then semiring_name + // is independent of the mask_name + // output: (all of size at least GB_CUDA_LEN+1) + char *semiring_name, // name of the semiring + char *semiring_code, // List of types and macro defs + char *mask_name // definition of mask data load +) +{ + + // check inputs + ASSERT (semiring->object_kind == GB_BUILTIN) ; + + // get the semiring + GrB_Monoid add = semiring->add ; + GrB_BinaryOp mult = semiring->multiply ; + GrB_BinaryOp addop = add->op ; + GrB_Type xtype = mult->xtype ; + GrB_Type ytype = mult->ytype ; + GrB_Type ztype = mult->ztype ; + GB_Opcode mult_opcode = mult->opcode ; + GB_Opcode add_opcode = addop->opcode ; + GB_Type_code xcode = xtype->code ; + GB_Type_code ycode = ytype->code ; + GB_Type_code zcode = ztype->code ; + + // these must always be true for any semiring: + ASSERT (mult->ztype == addop->ztype) ; + ASSERT (addop->xtype == addop->ztype && addop->ytype == addop->ztype) ; + + // for now, this is true for all built-in binops: + ASSERT (xcode == ycode) ; + + //-------------------------------------------------------------------------- + // rename redundant boolean operators + //-------------------------------------------------------------------------- + + // consider z = op(x,y) where both x and y are boolean: + // DIV becomes FIRST + // RDIV becomes SECOND + // MIN and TIMES become LAND + // MAX and PLUS become LOR + // NE, ISNE, RMINUS, and MINUS become LXOR + // ISEQ becomes EQ + // ISGT becomes GT + // ISLT becomes LT + // ISGE becomes GE + // ISLE becomes LE + + if (zcode == GB_BOOL_code) + { + // rename the monoid + add_opcode = GB_boolean_rename (add_opcode) ; + } + + if (xcode == GB_BOOL_code) // && (ycode == GB_BOOL_code) + { + // rename the multiplicative operator + mult_opcode = GB_boolean_rename (mult_opcode) ; + } + + //-------------------------------------------------------------------------- + // handle the flip + //-------------------------------------------------------------------------- + + if (flipxy) + { + // z = fmult (b,a) will be computed: handle this by renaming the + // multiplicative operator + + // handle the flip + mult_opcode = GB_binop_flip (mult_opcode) ; + + // the flip is now handled completely. This assumes xtype and ytype + // are the same for all built-in operators. If this changes, the + // types will have to be flipped too. 
+ flipxy = false ; + } + + //-------------------------------------------------------------------------- + // determine if A and/or B are value-agnostic + //-------------------------------------------------------------------------- + + bool op_is_first = (mult_opcode == GB_FIRST_opcode ) ; + bool op_is_second = (mult_opcode == GB_SECOND_opcode) ; + bool op_is_pair = false ; // (mult_opcode == GB_PAIR_opcode) ; + bool A_is_pattern = op_is_second || op_is_pair ; + bool B_is_pattern = op_is_first || op_is_pair ; + + //-------------------------------------------------------------------------- + // construct macros to load scalars from A and B (and typecast) them + //-------------------------------------------------------------------------- + + char acast [GB_CUDA_STRLEN+1] ; + char bcast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (acast, "GB_GETA", A_is_pattern) ; + GB_cuda_stringify_load (bcast, "GB_GETB", B_is_pattern) ; + + //-------------------------------------------------------------------------- + // construct macros for the multiply + //-------------------------------------------------------------------------- + + char mult_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (mult_function, "GB_MULT", mult_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // construct the monoid macros + //-------------------------------------------------------------------------- + + char add_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (add_function, "GB_ADD", add_opcode, zcode) ; + + char identity_definition [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_identity ( identity_definition, add_opcode, zcode) ; + + bool is_terminal ; + char terminal_condition [GB_CUDA_STRLEN+1] ; + char terminal_statement [GB_CUDA_STRLEN+1] ; + + GB_cuda_stringify_terminal ( + &is_terminal, terminal_condition, terminal_statement, + "GB_TERMINAL_CONDITION", "GB_IF_TERMINAL_BREAK", add_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // macro to typecast the result back into C + //-------------------------------------------------------------------------- + + // for the ANY_PAIR semiring, "c_is_one" will be true, and Cx [0..cnz] will + // be filled with all 1's later. 
+ bool c_is_one = false ; + // TODO: + // (add_opcode == GB_ANY_opcode && mult_opcode == GB_PAIR_opcode) ; + char ccast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (ccast, "GB_PUTC", c_is_one) ; + + //-------------------------------------------------------------------------- + // construct the macros to access the mask (if any), and its name + //-------------------------------------------------------------------------- + + const char *mask_string = "" ; + const char *mask_type_name = "" ; + const char *struct_str = "struct"; + if (mtype != NULL) + { + mask_string = GB_cuda_stringify_mask (mtype->code, Mask_struct) ; + mask_type_name = mtype->name ; + } + else + { + mask_type_name = struct_str; + } + + snprintf (mask_name, GB_CUDA_STRLEN, "mask_%s", mask_type_name) ; + + //-------------------------------------------------------------------------- + // build the final semiring code + //-------------------------------------------------------------------------- + + snprintf (semiring_code, GB_CUDA_STRLEN, + "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", + acast, bcast, mult_function, add_function, identity_definition, + terminal_condition, terminal_statement, ccast, mask_string) ; + + //-------------------------------------------------------------------------- + // build the final semiring name + //-------------------------------------------------------------------------- + + // the semiring_name depends on: + // add_opcode + // mult_opcode + // ztype->name + // xtype->name (currently, always == ytype->name, but will change (TODO)) + // ytype->name + // ctype->name + // mask_type_name (but only if mask_in_semiring_name is true) + // atype->name + // btype->name + + const char *add_name; + const char *mult_name; + + add_name = GB_cuda_stringify_opcode (add_opcode) ; + mult_name = GB_cuda_stringify_opcode (mult_opcode) ; + +// these are not needed: they are template parameters to the CUDA kernel: +// ztype->name, xtype->name, ytype->name, +// ctype->name, atype->name, btype->name + +// ztype->name is required, since the kernel needs it for the identity +// value. xtype->name is not strictly required. However, the GraphBLAS +// naming scheme is add_mult_xtype, so it is included here. The ytype +// and ztype need not be xtype. + + if (mask_in_semiring_name) + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_M_mtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_M_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // these are not in the GrB* or GxB* name, but are needed by CUDA: + // mask_type_name is (say) 'int64' or 'bool'. + // ztype is the name of the monoid type. + mask_type_name, ztype->name) ; + + } + else + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // this is not in the GrB* or GxB* name, but is needed by CUDA: + // ztype is the name of the monoid type. 
+ ztype->name) ; + + } + + printf ("semiring_name:\n%s\n", semiring_name) ; + //printf ("semiring_code:\n%s\n", semiring_code) ; + //printf ("mask_name: \n%s\n", mask_name) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_terminal.c b/GraphBLAS/CUDA/GB_cuda_stringify_terminal.c new file mode 100644 index 0000000000..26773d0383 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_terminal.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: Apache-2.0 +//------------------------------------------------------------------------------ +// GB_cuda_stringify_terminal: string to check terminal condition +//------------------------------------------------------------------------------ + +// The macro_condition_name(cij) should return true if the value of cij has +// reached its terminal value, or false otherwise. If the monoid is not +// terminal, then the macro should always return false. The ANY monoid +// should always return true. + +// The macro_statement_name is a macro containing a full statement. If the +// monoid is never terminal, it becomes the empty statement (";"). Otherwise, +// it checks the terminal condition and does a "break" if true. + +#include "GB.h" +#include "GB_cuda_stringify.h" + +void GB_cuda_stringify_terminal // return strings to check terminal +( + // outputs: + bool *is_monoid_terminal, + char *terminal_condition, + char *terminal_statement, + // inputs: + const char *macro_condition_name, + const char *macro_statement_name, + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // op->ztype->code +) +{ + + //-------------------------------------------------------------------------- + // determine if the monoid is terminal, and find its terminal value + //-------------------------------------------------------------------------- + + bool is_terminal = false ; + const char *f = NULL ; + + switch (opcode) + { + + #if 0 + case GB_ANY_opcode : + f = NULL ; + is_terminal = true ; + break ; + #endif + + case GB_MIN_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "false" ; break ; + case GB_INT8_code : f = "INT8_MIN" ; break ; + case GB_INT16_code : f = "INT16_MIN" ; break ; + case GB_INT32_code : f = "INT32_MIN" ; break ; + case GB_INT64_code : f = "INT64_MIN" ; break ; + case GB_UINT8_code : f = "0" ; break ; + case GB_UINT16_code : f = "0" ; break ; + case GB_UINT32_code : f = "0" ; break ; + case GB_UINT64_code : f = "0" ; break ; + default : f = "(-INFINITY)" ; break ; + } + break ; + + case GB_MAX_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "true" ; break ; + case GB_INT8_code : f = "INT8_MAX" ; break ; + case GB_INT16_code : f = "INT16_MAX" ; break ; + case GB_INT32_code : f = "INT32_MAX" ; break ; + case GB_INT64_code : f = "INT64_MAX" ; break ; + case GB_UINT8_code : f = "UINT8_MAX" ; break ; + case GB_UINT16_code : f = "UINT16_MAX" ; break ; + case GB_UINT32_code : f = "UINT32_MAX" ; break ; + case GB_UINT64_code : f = "UINT64_MAX" ; break ; + default : f = "INFINITY" ; break ; + } + break ; + + case GB_PLUS_opcode : + + if (zcode == GB_BOOL_code) + { + f = "true" ; // boolean OR + is_terminal = true ; + } + else + { + f = NULL ; + is_terminal = false ; + } + break ; + + case GB_TIMES_opcode : + + switch (zcode) + { + case GB_BOOL_code : // boolean AND + case GB_INT8_code : + case GB_INT16_code : + case GB_INT32_code : + case GB_INT64_code : + case GB_UINT8_code : + case GB_UINT16_code : + case GB_UINT32_code : + case GB_UINT64_code : + f = "0" ; + 
is_terminal = true ; + break ; + default : + f = NULL ; + is_terminal = false ; + break ; + } + break ; + + case GB_LOR_opcode : f = "true" ; is_terminal = true ; break ; + case GB_LAND_opcode : f = "false" ; is_terminal = true ; break ; + + case GB_LXOR_opcode : + // case GB_LXNOR_opcode : + case GB_EQ_opcode : + default : + // the monoid is not terminal + f = NULL ; + is_terminal = false ; + break ; + } + + //-------------------------------------------------------------------------- + // construct the macro to test the terminal condition + //-------------------------------------------------------------------------- + + if (is_terminal) + { + // the monoid is terminal + if (f == NULL) + { + // ANY monoid + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) true", macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s break", macro_statement_name) ; + } + else + { + // typical terminal monoids: check if C(i,j) has reached its + // terminal value + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) ((cij) == %s)", macro_condition_name, f) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s if (%s (cij)) break", + macro_statement_name, macro_condition_name) ; + } + } + else + { + // the monoid is not terminal: the condition is always false + snprintf (terminal_condition, GB_CUDA_STRLEN, "#define %s(cij) false", + macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, "#define %s", + macro_statement_name) ; + } + + (*is_monoid_terminal) = is_terminal ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_type_bits.c b/GraphBLAS/CUDA/GB_cuda_type_bits.c new file mode 100644 index 0000000000..8712f1698d --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_type_bits.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "GB.h" + +size_t GB_cuda_type_bits (GB_Type_code); + +size_t GB_cuda_type_bits (GB_Type_code type_code) +{ + switch (type_code) + { + case GB_BOOL_code : return (8) ; + case GB_INT8_code : return (8) ; + case GB_INT16_code : return (16) ; + case GB_INT32_code : return (32) ; + case GB_INT64_code : return (64) ; + case GB_UINT8_code : return (8) ; + case GB_UINT16_code : return (16) ; + case GB_UINT32_code : return (32) ; + case GB_UINT64_code : return (64) ; + case GB_FP32_code : return (32) ; + case GB_FP64_code : return (64) ; +// case GB_FC32_code : return (64) ; +// case GB_FC64_code : return (128) ; + default : return (0) ; + } +} + diff --git a/GraphBLAS/CUDA/GB_cuda_warmup.cu b/GraphBLAS/CUDA/GB_cuda_warmup.cu new file mode 100644 index 0000000000..6a0283bec3 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_warmup.cu @@ -0,0 +1,74 @@ +//------------------------------------------------------------------------------ +// GB_cuda_warmup.cu: warmup the GPU +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
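// Illustration (hypothetical macro names, not part of the files in this
// patch): if GB_cuda_stringify_terminal above is given the macro names
// GB_TERMINAL_CONDITION and GB_IF_TERMINAL_BREAK, then for the MIN monoid
// on int64 values it produces
//
//      #define GB_TERMINAL_CONDITION(cij) ((cij) == INT64_MIN)
//      #define GB_IF_TERMINAL_BREAK if (GB_TERMINAL_CONDITION (cij)) break
//
// while for a non-terminal monoid such as PLUS on double it produces
//
//      #define GB_TERMINAL_CONDITION(cij) false
//      #define GB_IF_TERMINAL_BREAK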
+ +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" +/* +#include "rmm/include/rmm/mr/device/managed_memory_resource.hpp" +#include "rmm/include/rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/include/rmm/mr/device/owning_wrapper.hpp" +#include "rmm/include/rmm/mr/device/default_memory_resource.hpp" +#include "rmm/include/rmm/mr/device/per_device_resource.hpp" +#include "rmm/include/rmm/mr/device/cnmem_managed_memory_resource.hpp" +*/ +#include "rmm/detail/cnmem.h" + +bool GB_cuda_warmup (int device) +{ + // allocate 'nothing' just to load the drivers. + // No need to free the result. + double gpu_memory_size = GB_Global_gpu_memorysize_get (device); + + printf ("warming up device %d memsize %g sms %d\n", + device, + gpu_memory_size, + GB_Global_gpu_sm_get (device)) ; + + + //auto cuda_managed = std::make_shared(); + //auto cuda = std::make_shared(); + //auto pool = rmm::mr::make_owning_wrapper + // ( cuda_managed, gpu_memory_size/2, gpu_memory_size ) ; + + std::vector dev{0}; + cnmemDevice_t cnmem_device; + memset(&cnmem_device, 0, sizeof(cnmem_device) ) ; + cnmem_device.size = gpu_memory_size/2; + if( device ==0) + { + cnmemInit(1, &cnmem_device, CNMEM_FLAGS_MANAGED); + } + + //auto pool = std::make_shared ( gpu_memory_size/2 ) ; + + + //rmm::mr::set_per_device_resource ( rmm::cuda_device_id{device}, + // ( rmm::mr::device_memory_resource *)pool.get() ) ; + + //rmm::mr::set_default_resource ( pool.get() ); + //rmm::mr::set_current_device_resource ( pool.get() ); + + //GB_Global_gpu_device_memory_resource_set( device, (void *)rmm::mr::get_current_device_resource() ); + + void *p ; + //cudaError_t err = cudaMalloc (&p, (size_t) 0) ; + //p = rmm::mr::get_current_device_resource()->allocate( 256) ; + //p = pool->allocate( 10) ; + cnmemMalloc( &p, 256 , NULL); + //rmm::mr::get_current_device_resource()->deallocate(p, 1); + //pool->deallocate( p, 10); + cnmemFree( p, NULL); + + printf ("GPU %d nice and toasty now, pool=%g\n", device, gpu_memory_size/2 ) ; + + // TODO check for jit cache? or in GB_init? + + return true; //(err == cudaSuccess) ; +} + diff --git a/GraphBLAS/CUDA/GB_jit_cache.cu b/GraphBLAS/CUDA/GB_jit_cache.cu new file mode 100644 index 0000000000..9df0889865 --- /dev/null +++ b/GraphBLAS/CUDA/GB_jit_cache.cu @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2019,2020 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "GB_jit_cache.h" + +namespace jit { + + +// Get the directory in home to use for storing the cache +std::string get_user_home_cache_dir() { + auto home_dir = std::getenv("HOME"); + if (home_dir != nullptr) { + return std::string(home_dir) + "/.GraphBLAS/"; + } else { + return std::string(); + } +} + +// Default `GRAPHBLAS_CACHE_PATH` to `$HOME/.GraphBLAS`. +// This definition can be overridden at compile time by specifying a +// `-DGRAPHBLAS_CACHE_PATH=/kernel/cache/path` CMake argument. 
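// For example (echoing the path used above; whether a plain cmake -D flag is
// forwarded as a compile definition depends on the build setup), the two
// override mechanisms are:
//
//      -DGRAPHBLAS_CACHE_PATH=/kernel/cache/path       (compile time)
//      export GRAPHBLAS_CACHE_PATH=/kernel/cache/path  (run time)
//
// The run-time environment variable, if set, takes precedence over the
// compile-time default (see getCacheDir below).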
+// This path is used in the `getCacheDir()` function below. +#if !defined(GRAPHBLAS_CACHE_PATH) +#define GRAPHBLAS_CACHE_PATH get_user_home_cache_dir() +#endif + +/** + * @brief Get the string path to the JITIFY kernel cache directory. + * + * This path can be overridden at runtime by defining an environment variable + * named `GRAPHBLAS_CACHE_PATH`. The value of this variable must be a path + * under which the process' user has read/write priveleges. + * + * This function returns a path to the cache directory, creating it if it + * doesn't exist. + * + * The default cache directory is `$HOME/.GraphBLAS`. If no overrides + * are used and if $HOME is not defined, returns an empty path and file + * caching is not used. + **/ +std::string getCacheDir() { + // The environment variable always overrides the + // default/compile-time value of `GRAPHBLAS_CACHE_PATH` + auto kernel_cache_path_env = std::getenv("GRAPHBLAS_CACHE_PATH"); + auto kernel_cache_path = (kernel_cache_path_env != nullptr ? kernel_cache_path_env + : GRAPHBLAS_CACHE_PATH); + + struct stat st; + if ( (stat( kernel_cache_path.c_str(), &st) != 0) ) { + // `mkdir -p` the kernel cache path if it doesn't exist + printf("cache is going to path %s\n", kernel_cache_path.c_str()); + int status; + status = mkdir(kernel_cache_path.c_str(), 0777); + if (status != 0 ) return std::string(); + //boost::filesystem::create_directories(kernel_cache_path); + } + return std::string(kernel_cache_path); +} + +GBJitCache::GBJitCache() { } + +GBJitCache::~GBJitCache() { } + +std::mutex GBJitCache::_kernel_cache_mutex; +std::mutex GBJitCache::_program_cache_mutex; + +named_prog GBJitCache::getProgram( + std::string const& prog_name, + std::string const& cuda_source, + std::vector const& given_headers, + std::vector const& given_options, + jitify::experimental::file_callback_type file_callback) +{ + // Lock for thread safety + std::lock_guard lock(_program_cache_mutex); + //printf(" jit_cache get program %s\n", prog_name.c_str()); + + return getCached(prog_name, program_map, + [&](){ + return jitify::experimental::Program(cuda_source, + given_headers, + given_options, + file_callback); + } + ); +} + +named_prog GBJitCache::getKernelInstantiation( + std::string const& kern_name, + named_prog const& named_program, + std::vector const& arguments) +{ + // Lock for thread safety + std::lock_guard lock(_kernel_cache_mutex); + + std::string prog_name = std::get<0>(named_program); + jitify::experimental::Program& program = *std::get<1>(named_program); + + // Make instance name e.g. "prog_binop.kernel_v_v_int_int_long int_Add" + std::string kern_inst_name = prog_name + '.' 
+ kern_name; + for ( auto&& arg : arguments ) kern_inst_name += '_' + arg; + + //printf(" got kernel instance %s\n",kern_inst_name.c_str()); + + return getCached(kern_inst_name, kernel_inst_map, + [&](){return program.kernel(kern_name) + .instantiate(arguments); + } + ); +} + +// Another overload for getKernelInstantiation which might be useful to get +// kernel instantiations in one step +// ------------------------------------------------------------------------ +/* +jitify::experimental::KernelInstantiation GBJitCache::getKernelInstantiation( + std::string const& kern_name, + std::string const& prog_name, + std::string const& cuda_source = "", + std::vector const& given_headers = {}, + std::vector const& given_options = {}, + file_callback_type file_callback = nullptr) +{ + auto program = getProgram(prog_name, + cuda_source, + given_headers, + given_options, + file_callback); + return getKernelInstantiation(kern_name, program); +} +*/ + +GBJitCache::cacheFile::cacheFile(std::string file_name) + : _file_name{file_name} +{ } + +GBJitCache::cacheFile::~cacheFile() { } + +std::string GBJitCache::cacheFile::read() +{ + // Open file (duh) + int fd = open ( _file_name.c_str(), O_RDWR ); + if ( fd == -1 ) { + // TODO: connect errors to GrB_error result + //printf(" failed to open cache file %s\n",_file_name.c_str()); + successful_read = false; + return std::string(); + } + + // Lock the file descriptor. we the only ones now + if ( lockf(fd, F_LOCK, 0) == -1 ) { + successful_read = false; + return std::string(); + } + + // Get file descriptor from file pointer + FILE *fp = fdopen( fd, "rb" ); + + // Get file length + fseek( fp , 0L , SEEK_END); + size_t file_size = ftell( fp ); + rewind( fp ); + + // Allocate memory of file length size + std::string content; + content.resize(file_size); + char *buffer = &content[0]; + + // Copy file into buffer + if( fread(buffer, file_size, 1, fp) != 1 ) { + //printf(" failed to read cache file %s\n",_file_name.c_str()); + successful_read = false; + fclose(fp); + free(buffer); + return std::string(); + } + fclose(fp); + successful_read = true; + printf(" read cache file %s\n",_file_name.c_str()); + + return content; +} + +void GBJitCache::cacheFile::write(std::string content) +{ + // Open file and create if it doesn't exist, with access 0600 + int fd = open ( _file_name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ); + if ( fd == -1 ) { + printf(" failed to open cache file for write %s\n",_file_name.c_str()); + successful_write = false; + return; + } + + // Lock the file descriptor. we the only ones now + if ( lockf(fd, F_LOCK, 0) == -1 ) { + successful_write = false; + return; + } + + // Get file descriptor from file pointer + FILE *fp = fdopen( fd, "wb" ); + + // Copy string into file + if( fwrite(content.c_str(), content.length(), 1, fp) != 1 ) { + printf(" failed to write cache file %s\n",_file_name.c_str()); + successful_write = false; + fclose(fp); + return; + } + fclose(fp); + + successful_write = true; + //printf(" wrote cache file %s\n",_file_name.c_str()); + + return; +} + +} // namespace jit diff --git a/GraphBLAS/CUDA/GB_jit_cache.h b/GraphBLAS/CUDA/GB_jit_cache.h new file mode 100644 index 0000000000..0564c58f73 --- /dev/null +++ b/GraphBLAS/CUDA/GB_jit_cache.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019,2020 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GB_JIT_CACHE_H_ +#define GB_JIT_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include + + +#define JITIFY_USE_CACHE 1 + +namespace jit { + +template +using named_prog = std::pair>; + +/** + * @brief Get the string path to the JITIFY kernel cache directory. + * + * This path can be overridden at runtime by defining an environment variable + * named `GB_CUDA_KERNEL_CACHE_PATH`. The value of this variable must be a path + * under which the process' user has read/write priveleges. + * + * This function returns a path to the cache directory, creating it if it + * doesn't exist. + * + * The default cache directory `~/.GraphBLAS_kernel_cache`. + **/ + +std::string getCacheDir(); + +class GBJitCache +{ +public: + + /**---------------------------------------------------------------------------* + * @brief Get a process wide singleton cache object + * + *---------------------------------------------------------------------------**/ + static GBJitCache& Instance() { + // Meyers' singleton is thread safe in C++11 + // Link: https://stackoverflow.com/a/1661564 + static GBJitCache cache; + return cache; + } + + GBJitCache(); + ~GBJitCache(); + + /**---------------------------------------------------------------------------* + * @brief Get the Kernel Instantiation object + * + * Searches an internal in-memory cache and file based cache for the kernel + * and if not found, JIT compiles and returns the kernel + * + * @param kern_name [in] name of kernel to return + * @param program [in] Jitify preprocessed program to get the kernel from + * @param arguments [in] template arguments for kernel in vector of strings + * @return Pair of string kernel identifier and compiled kernel object + *---------------------------------------------------------------------------**/ + named_prog getKernelInstantiation( + std::string const& kern_name, + named_prog const& program, + std::vector const& arguments); + + /**---------------------------------------------------------------------------* + * @brief Get the Jitify preprocessed Program object + * + * Searches an internal in-memory cache and file based cache for the Jitify + * pre-processed program and if not found, JIT processes and returns it + * + * @param prog_file_name [in] name of program to return + * @param cuda_source [in] string source code of program to compile + * @param given_headers [in] vector of strings representing source or names of + * each header included in cuda_source + * @param given_options [in] vector of strings options to pass to NVRTC + * @param file_callback [in] pointer to callback function to call whenever a + * header needs to be loaded + * @return named_prog + *---------------------------------------------------------------------------**/ + named_prog getProgram( + std::string const& prog_file_name, + std::string const& cuda_source = "", + std::vector const& given_headers = {}, + std::vector const& given_options = {}, + jitify::experimental::file_callback_type file_callback = nullptr); + +private: + template + using umap_str_shptr = std::unordered_map>; + + umap_str_shptr 
kernel_inst_map; + umap_str_shptr program_map; + + /* + Even though this class can be used as a non-singleton, the file cache + access should remain limited to one thread per process. The lockf locks can + prevent multiple processes from accessing the file but are ineffective in + preventing multiple threads from doing so as the lock is shared by the + entire process. + Therefore the mutexes are static. + */ + static std::mutex _kernel_cache_mutex; + static std::mutex _program_cache_mutex; + +private: + /**---------------------------------------------------------------------------* + * @brief Class to allow process wise exclusive access to cache files + * + *---------------------------------------------------------------------------**/ + class cacheFile + { + private: + std::string _file_name ; + std::string _dir_name = "~/.GraphBLAS_kernel_cache/"; + bool successful_read = false; + bool successful_write = false; + public: + cacheFile(std::string file_name); + ~cacheFile(); + + /**---------------------------------------------------------------------------* + * @brief Read this file and return the contents as a std::string + * + *---------------------------------------------------------------------------**/ + std::string read(); + + /**---------------------------------------------------------------------------* + * @brief Write the passed string to this file + * + *---------------------------------------------------------------------------**/ + void write(std::string); + + /**---------------------------------------------------------------------------* + * @brief Check whether the read() operation on the file completed successfully + * + * @return true Read was successful. String returned by `read()` is valid + * @return false Read was unsuccessful. String returned by `read()` is empty + *---------------------------------------------------------------------------**/ + bool is_read_successful() { return successful_read; } + + /**---------------------------------------------------------------------------* + * @brief Check whether the write() operation on the file completed successfully + * + * @return true Write was successful. + * @return false Write was unsuccessful. File state is undefined + *---------------------------------------------------------------------------**/ + bool is_write_successful() { return successful_write; } + }; + +private: + template + named_prog getCached( + std::string const& name, + umap_str_shptr& map, + FallbackFunc func) { + + // Find memory cached T object + auto it = map.find(name); + if ( it != map.end()) { + std::cout<<"found memory-cached prog "<second); + } + else { // Find file cached T object + bool successful_read = false; + std::string serialized; + #if defined(JITIFY_USE_CACHE) + std::string cache_dir = getCacheDir(); + if (not cache_dir.empty() ) { + std::string file_name = cache_dir + name; + //std::cout<<"looking for prog in file "<(T::deserialize(serialized)); + map[name] = program; + //std::cout<<"storing prog in memory "< + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace jit { + + launcher::launcher( + const std::string& hash, + const std::string& cuda_source, + const std::vector& header_names, + const std::vector& compiler_flags, + jitify::experimental::file_callback_type file_callback, + cudaStream_t stream + ) + : cache_instance{jit::GBJitCache::Instance()} + , stream(stream) + { + program = cache_instance.getProgram( + hash, + cuda_source.c_str(), + header_names, + compiler_flags, + file_callback + ); + } + + launcher::launcher(launcher&& launcher) + : program {std::move(launcher.program)} + , cache_instance {jit::GBJitCache::Instance()} + , kernel_inst {std::move(launcher.kernel_inst)} + , stream {launcher.stream} + { } + +} // namespace jit diff --git a/GraphBLAS/CUDA/GB_jit_launcher.h b/GraphBLAS/CUDA/GB_jit_launcher.h new file mode 100644 index 0000000000..c01b385fcc --- /dev/null +++ b/GraphBLAS/CUDA/GB_jit_launcher.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019,2020 NVIDIA CORPORATION. + * + * Copyright 2018-2019 BlazingDB, Inc. + * Copyright 2018 Christian Noboa Mardini + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GB_JIT_LAUNCHER_H +#define GB_JIT_LAUNCHER_H + +#include +#include +#include +#include +#include + +#define JITIFY_PRINT_INSTANTIATION 0 +#define JITIFY_PRINT_SOURCE 1 +#define JITIFY_PRINT_LOG 1 +#define JITIFY_PRINT_PTX 1 +#define JITIFY_PRINT_LINKER_LOG 0 +#define JITIFY_PRINT_LAUNCH 1 +#include + +const std::vector compiler_flags{ + "-std=c++14", + "--use_fast_math", + "-remove-unused-globals", + "-w", + "-D__CUDACC_RTC__", + "-I.", + "-I..", + "-I../../Include", + "-I../../Source", + "-I../../Source/Template", + "-Ilocal_cub/block", + "-Itemplates", + "-I/usr/local/cuda/include" +}; + +namespace jit { + +/** + * @brief Class used to handle compilation and execution of JIT kernels + * + */ +class launcher { + public: + launcher() = delete; + + /** + * @brief C'tor of the launcher class + * + * Method to generate vector containing all template types for a JIT kernel. + * This vector is used to get the compiled kernel for one set of types and set + * it as the kernel to launch using this launcher. + * + * @param hash The hash to be used as the key for caching + * @param cuda_code The CUDA code that contains the kernel to be launched + * @param header_names Strings of header_names or strings that contain content + * of the header files + * @param compiler_flags Strings of compiler flags + * @param file_callback a function that returns header file contents given header + * file names. 
+ * @param stream The non-owned stream to use for execution + */ + launcher( + const std::string& hash, + const std::string& cuda_source, + const std::vector& header_names, + const std::vector& compiler_flags, + jitify::experimental::file_callback_type file_callback, + cudaStream_t stream = 0 + ); + launcher(launcher&&); + launcher(const launcher&) = delete; + launcher& operator=(launcher&&) = delete; + launcher& operator=(const launcher&) = delete; + + /** + * @brief Sets the kernel to launch using this launcher + * + * Method to generate vector containing all template types for a JIT kernel. + * This vector is used to get the compiled kernel for one set of types and set + * it as the kernel to launch using this launcher. + * + * @param kernel_name The kernel to be launched + * @param arguments The template arguments to be used to instantiate the kernel + * @return launcher& ref to this launcehr object + */ + launcher& set_kernel_inst( + const std::string& kernel_name, + const std::vector& arguments + ) + { // program is a member variable of the launcher + kernel_inst = cache_instance.getKernelInstantiation(kernel_name, program, arguments); + return *this; + } + + /** + * @brief Handle the Jitify API to launch using information + * contained in the members of `this` + * + * @tparam grid and block sizes + * @return Return launcher reference if successful + */ + jitify::experimental::KernelLauncher configure( dim3 grid, dim3 block){ + return get_kernel().configure( grid, block); + //return get_kernel().configure_1d_max_occupancy( max_block_size=block.x); + } + + + /** + * @brief Handle the Jitify API to launch using information + * contained in the members of `this` + * + * @tparam All parameters to launch the kernel + * @return Return GDF_SUCCESS if successful + */ + template + void launch(Args ... args){ + get_kernel().configure_1d_max_occupancy(32, 0, 0, stream).launch(args...); + } + + private: + jit::GBJitCache& cache_instance; + jit::named_prog program; + jit::named_prog kernel_inst; + cudaStream_t stream; + + jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); } +}; + +} // namespace jit + +#endif diff --git a/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cu b/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cu new file mode 100644 index 0000000000..4c52e4d427 --- /dev/null +++ b/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cu @@ -0,0 +1,93 @@ + +//------------------------------------------------------------------------------ +// GB_reduce_to_scalar_cuda.cu: reduce on the GPU with semiring +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
+
+//------------------------------------------------------------------------------
+
+#include "GB_cuda.h"
+
+#include "templates/reduceWarp.cu.jit"
+#include "templates/reduceNonZombiesWarp.cu.jit"
+#include "test/semiringFactory.hpp"
+
+#include "GB_jit_launcher.h"
+#include "GB_callback.hpp"
+
+GB_callback *SR_callback_ptr;
+
+std::istream* callback_wrapper( std::string file_name, std::iostream& tmp){
+  return SR_callback_ptr->callback( file_name, tmp);
+}
+
+const std::vector<std::string> header_names ={};
+
+GrB_Info GB_reduce_to_scalar_cuda
+(
+    GB_void *s,
+    const GrB_Monoid reduce,
+    const GrB_Matrix A,
+    GB_Context Context
+)
+{
+
+    printf ("Hi I am %s :-)\n", __FILE__) ;
+
+    // result = sum (Anz [0..anz-1]) using the GPU,
+    // with a kernel that has ntasks = grid.x and blocksize = blockDim.x
+    // nthreads = # of GPUs to use, but 1 for now
+    // We have a workspace W of size ntasks.
+
+    thread_local static jitify::JitCache kernel_cache;
+    std::string reduce_kernel_name = "reduceNonZombiesWarp";
+
+    // stringified kernel specified above
+    jitify::Program program= kernel_cache.program( templates_reduceNonZombiesWarp_cu, 0, 0,
+        file_callback_plus);
+        //{"--use_fast_math", "-I/usr/local/cuda/include"});
+
+    int nnz = GB_NNZ( A ) ;
+    GrB_Type ctype = reduce->op->ztype ;
+
+    int blocksize = 1024 ;
+    int ntasks = ( nnz + blocksize -1) / blocksize ;
+
+    int32_t *block_sum;
+    //cudaMallocManaged ((void**) &block_sum, (num_reduce_blocks)*sizeof(int32_t)) ;
+    block_sum = (int32_t*)GB_cuda_malloc( (ntasks)*sizeof(int32_t)) ;
+
+    dim3 red_grid(ntasks);
+    dim3 red_block(blocksize);
+
+    GBURBLE ("(GPU reduce launch nblocks,blocksize= %d,%d )\n", ntasks, blocksize) ;
+    jit::launcher( reduce_kernel_name + "_" + reduce->op->name,
+                   templates_reduceNonZombiesWarp_cu,
+                   header_names,
+                   compiler_flags,
+                   callback_wrapper)
+       .set_kernel_inst( reduce_kernel_name , { ctype->name })
+       .configure(red_grid, red_block) //if commented, use implicit 1D configure in launch
+       .launch(
+                A->i,               // index vector, only sum up values >= 0
+                A->x,               // input pointer to vector to reduce, with zombies
+                block_sum,          // Block sums on return
+                (unsigned int)nnz   // length of vector to reduce to scalar
+
+        );
+
+    cudaDeviceSynchronize();
+
+
+    for (int i = 0 ; i < ntasks ; i++)
+    {
+       *s += (block_sum [i]) ;
+    }
+
+
+    return (GrB_SUCCESS) ;
+}
+
diff --git a/GraphBLAS/CUDA/License.txt b/GraphBLAS/CUDA/License.txt
new file mode 100644
index 0000000000..8ad4645770
--- /dev/null
+++ b/GraphBLAS/CUDA/License.txt
@@ -0,0 +1,36 @@
+This directory contains licensed OSS under the following terms:
+
+RMM
+http://github.com/rapidsai/rmm
+Apache-2.0 license
+
+CNMEM
+http://github.com/NVIDIA/cnmem
+BSD 3-Clause
+
+Jitify
+http://github.com/NVIDIA/jitify
+BSD 3-Clause "New" or "Revised" License
+
+CUB
+http://github.com/NVIDIA/cub
+BSD 3-Clause "New" or "Revised" License
+
+In addition, any source files not part of the above packages are hereby
+licensed under the Apache-2.0 license.
+
+Copyright 2020, NVIDIA Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
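The templates/reduceNonZombiesWarp.cu kernel source is embedded above only in its
stringified .cu.jit form, so the kernel body itself does not appear in this patch.
As a rough, hypothetical sketch of what such a kernel could look like — assuming
the template parameter T plays the role of ctype above, that an entry p is a
zombie iff A->i [p] < 0, that blockDim.x is a power of two no larger than 1024
(the host code launches with blocksize = 1024), and using + in place of the
monoid operator:

// Hypothetical sketch only; not the actual templates/reduceNonZombiesWarp.cu.
template <typename T>
__global__ void reduceNonZombiesWarp_sketch     // one partial sum per block
(
    const int64_t *Ai,      // index vector; a negative index marks a zombie
    const T *Ax,            // values to reduce
    T *block_sum,           // output: block_sum [0..gridDim.x-1]
    unsigned int n          // number of entries in Ai and Ax
)
{
    // each thread sums its grid-stride slice, skipping zombies
    T sum = 0 ;
    for (unsigned int p = blockIdx.x * blockDim.x + threadIdx.x ; p < n ;
         p += blockDim.x * gridDim.x)
    {
        if (Ai [p] >= 0) sum += Ax [p] ;
    }
    // combine the per-thread sums within this block in shared memory
    __shared__ T partial [1024] ;
    partial [threadIdx.x] = sum ;
    __syncthreads ( ) ;
    for (unsigned int stride = blockDim.x >> 1 ; stride > 0 ; stride >>= 1)
    {
        if (threadIdx.x < stride)
        {
            partial [threadIdx.x] += partial [threadIdx.x + stride] ;
        }
        __syncthreads ( ) ;
    }
    if (threadIdx.x == 0) block_sum [blockIdx.x] = partial [0] ;
}

The host code in GB_reduce_to_scalar_cuda above then finishes the reduction by
summing block_sum [0..ntasks-1] after cudaDeviceSynchronize.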
+ + diff --git a/GraphBLAS/CUDA/Makefile b/GraphBLAS/CUDA/Makefile new file mode 100644 index 0000000000..de2d973810 --- /dev/null +++ b/GraphBLAS/CUDA/Makefile @@ -0,0 +1,135 @@ +#------------------------------------------------------------------------------- +# GraphBLAS/CUDA/Makefile +#------------------------------------------------------------------------------- + +# cuda 10.1+ is assumed + +all: library + +GXX ?= g++ +DOXYGEN ?= doxygen +CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 + +CXX11 ?= 1 + +CUDA_DIR ?= /usr/local/cuda + +CXXFLAGS += -pthread + +ifeq ($(CXX11),1) + CXXFLAGS += -std=c++14 +endif + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + +INC += -I$(CUDA_DIR)/include +LIB += -ldl -L$(CUDA_LIB_DIR) -lcuda -lcudart -lnvrtc + + +GCC ?= gcc + +SRC = GB*.cu +SRC2 = $(notdir $(wildcard $(SRC))) +OBJ = $(SRC2:.cu=.o) +cSRC = $(wildcard *.c) +cOBJ = $(cSRC:.c=.o) + +I = -I. -I../Source -I../Source/Template -I../Include -Irmm/rmm/include/ -Irmm/thirdparty/spdlog/include -Irmm/include/thirdparty/cnmem/include +SO_NAME = libgraphblascuda.so +SO_OPTS = --shared \ + -Xlinker -soname \ + -Xlinker $(SO_NAME) + +LIBS = -L/usr/local/cuda/lib64 -lcudadevrt -lcudart -lrmm -lspdlog -lcnmem + +CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75 \ + --relocatable-device-code true \ + --std=c++14 -Xcompiler -fPIC + +library: $(SO_NAME) + +HEADERS = jitify.hpp + +TEMPLATES := $(wildcard templates/*.cu) + +JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) + +%.cu: %.cutmp + cp $? $@ + +%.cu.jit: %.cu + ./stringify $? > $@ + +stringify: stringify.cpp + $(GXX) -o $@ $< -O3 -Wall + +doc: jitify.hpp Doxyfile + $(DOXYGEN) Doxyfile +.PHONY: doc + +test: $(cOBJ) + @echo $(cOBJ) + +$(cOBJ): %.o: %.c GB_cuda_stringify.h + $(GCC) $(I) -o $@ -c $< -O2 -Wall + +$(SO_NAME): $(OBJ) $(cOBJ) $(JIT_TEMP) GB_AxB_dot3_cuda.o + echo $(OBJ) + nvcc $(SO_OPTS) $(LIBS) $(OBJ) $(cOBJ) -o $@ + +GB_AxB_dot3_cuda.o: $(JIT_TEMP) matrix.h +%.o: %.cu + nvcc -c $(I) $(CUDA_OPTS) -o $@ $< $(LIBS) + + +config: + nvidia-smi + nvcc --version + @echo " " + @echo "SO_NAME: " $(SO_NAME) + @echo "SO_OPTS: " $(SO_OPTS) + @echo "LIBS: " $(LIBS) + @echo "CUDA_OPTS: " $(CUDA_OPTS) + @echo "SRC: " $(SRC) + @echo "OBJ: " $(OBJ) + @echo "I: " $(I) + @echo " " + gcc --version + icc --version + +clean: + rm -f *.o + rm -f stringify +.PHONY: clean + +distclean: clean + rm -f *.so *.a + +purge: distclean + +################################################################################ + + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + + + + diff --git a/GraphBLAS/CUDA/Makefile.jitFactory b/GraphBLAS/CUDA/Makefile.jitFactory new file mode 100644 index 0000000000..eb6b0c7995 --- /dev/null +++ b/GraphBLAS/CUDA/Makefile.jitFactory @@ -0,0 +1,59 @@ + +GXX ?= g++ +DOXYGEN ?= doxygen +CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 + +CXX11 ?= 0 +CXX14 ?= 1 + +CUDA_DIR ?= /usr/local/cuda + +CXXFLAGS += -pthread + +ifeq ($(CXX11),1) + CXXFLAGS += -std=c++11 +endif +ifeq ($(CXX14),1) + CXXFLAGS += -std=c++14 +endif + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + 
CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + +INC += -I$(CUDA_DIR)/include +LIB += -ldl -L$(CUDA_LIB_DIR) -lcuda -lcudart -lnvrtc + +HEADERS = jitify.hpp + +TEMPLATES := $(wildcard *.cutmp) + +JIT_TEMP := $(patsubst %.cutmp, %.cu.jit, $(TEMPLATES)) + +jitFactory: jitFactory.cpp $(HEADERS) $(JIT_TEMP) + $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(LIB) + +%.cu: %.cutmp + cp $? $@ + +%.cu.jit: %.cu + ./stringify $? > $@ + +stringify: stringify.cpp + $(GXX) -o $@ $< -O3 -Wall + +doc: jitify.hpp Doxyfile + $(DOXYGEN) Doxyfile +.PHONY: doc + +clean: + rm -f stringify + rm -f jitFactory +.PHONY: clean diff --git a/GraphBLAS/CUDA/Makefile_new b/GraphBLAS/CUDA/Makefile_new new file mode 100644 index 0000000000..462bda5636 --- /dev/null +++ b/GraphBLAS/CUDA/Makefile_new @@ -0,0 +1,49 @@ +#------------------------------------------------------------------------------- +# GraphBLAS/CUDA/Makefile +#------------------------------------------------------------------------------- + +# cuda 10.1 is assumed + +SRC = GB*.cu +SRC2 = $(notdir $(wildcard $(SRC))) +OBJ = $(SRC2:.cu=.o) + +I = -I. -I../Source -I../Source/Template -I../Include +SO_NAME = libgraphblascuda.a + +LIBS = -L/usr/local/cuda/lib64 -lcudadevrt -lcudart + +CUDA_OPTS = -O2 --cudart=shared \ + --relocatable-device-code true \ + --std=c++11 -Xcompiler -fPIC + +$(SO_NAME): $(OBJ) + echo $(OBJ) + ar rv $@ $^ + +%.o: %.cu + nvcc -c $(I) $(CUDA_OPTS) -o $@ $< $(LIBS) + +config: + nvidia-smi + nvcc --version + @echo " " + @echo "SO_NAME: " $(SO_NAME) + @echo "SO_OPTS: " $(SO_OPTS) + @echo "LIBS: " $(LIBS) + @echo "CUDA_OPTS: " $(CUDA_OPTS) + @echo "SRC: " $(SRC) + @echo "OBJ: " $(OBJ) + @echo "I: " $(I) + @echo " " + gcc --version + icc --version + +clean: + rm -f *.o + +distclean: clean + rm -f *.so + +purge: distclean + diff --git a/GraphBLAS/CUDA/TODO.txt b/GraphBLAS/CUDA/TODO.txt new file mode 100644 index 0000000000..b80a11783c --- /dev/null +++ b/GraphBLAS/CUDA/TODO.txt @@ -0,0 +1,4 @@ + +TODO Get libgraphblascuda.a to work. +TODO why is pthread demo hanging with CUDA? + diff --git a/GraphBLAS/CUDA/binary_search.h b/GraphBLAS/CUDA/binary_search.h new file mode 100644 index 0000000000..a21d07e1e6 --- /dev/null +++ b/GraphBLAS/CUDA/binary_search.h @@ -0,0 +1,39 @@ + +#define GB_GETA( aval, ax, p) aval = (T_Z)ax[ ( p )] +#define GB_GETB( bval, bx, p) bval = (T_Z)bx[ ( p )] +#define GB_FLIP(i) (-(i)-2) +#define GB_IS_FLIPPED(i) ((i) < 0) +#define GB_IS_ZOMBIE(i) ((i) < 0) +#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) +#define GB_IS_NOT_ZOMBIE(i) ((i) >= 0) +#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) + +//------------------------------------------------------------------------------ +// GB_BINARY_SEARCH +//------------------------------------------------------------------------------ + +// search for integer i in the list X [pleft...pright]; no zombies. +// The list X [pleft ... pright] is in ascending order. It may have +// duplicates. + +#define GB_BINARY_TRIM_SEARCH(i,X,pleft,pright) \ +{ \ + /* binary search of X [pleft ... 
pright] for integer i */ \ + while (pleft < pright) \ + { \ + int64_t pmiddle = (pleft + pright) / 2 ; \ + if (X [pmiddle] < i) \ + { \ + /* if in the list, it appears in [pmiddle+1..pright] */ \ + pleft = pmiddle + 1 ; \ + } \ + else \ + { \ + /* if in the list, it appears in [pleft..pmiddle] */ \ + pright = pmiddle ; \ + } \ + } \ + /* binary search is narrowed down to a single item */ \ + /* or it has found the list is empty */ \ + /*ASSERT (pleft == pright || pleft == pright + 1) ;*/ \ +} diff --git a/GraphBLAS/CUDA/dot.c b/GraphBLAS/CUDA/dot.c new file mode 100644 index 0000000000..16e40f06aa --- /dev/null +++ b/GraphBLAS/CUDA/dot.c @@ -0,0 +1,31 @@ + +consider these methods on the GPU: (see ../Source/Template/GB_AxB_dot_cij.c) + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + + #if 0 + if (ia == ib) + { + GB_DOT (ia, pA, pB) ; + pA++ ; + pB++ ; + } + else + { + pA += (ia < ib) ; + pB += (ib < ia) ; + } + #endif + + #if 0 + // this might be fastest on the GPU + #if GB_IS_PLUS_PAIR_REAL_SEMIRING && GB_CTYPE_IGNORE_OVERFLOW + cij += (ia == ib) ; + pA += (ia <= ib) ; + pB += (ib <= ia) ; + #endif + #endif + } diff --git a/GraphBLAS/CUDA/go b/GraphBLAS/CUDA/go new file mode 100755 index 0000000000..4d7e48c801 --- /dev/null +++ b/GraphBLAS/CUDA/go @@ -0,0 +1,3 @@ +#!/bin/bash +./jitFactory > o ; vim o + diff --git a/GraphBLAS/CUDA/jitify.hpp b/GraphBLAS/CUDA/jitify.hpp new file mode 100644 index 0000000000..e9ff891155 --- /dev/null +++ b/GraphBLAS/CUDA/jitify.hpp @@ -0,0 +1,4185 @@ +/* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + ----------- + Jitify 0.9 + ----------- + A C++ library for easy integration of CUDA runtime compilation into + existing codes. + + -------------- + How to compile + -------------- + Compiler dependencies: , -std=c++11 + Linker dependencies: dl cuda nvrtc + + -------------------------------------- + Embedding source files into executable + -------------------------------------- + g++ ... 
-ldl -rdynamic -DJITIFY_ENABLE_EMBEDDED_FILES=1 + -Wl,-b,binary,my_kernel.cu,include/my_header.cuh,-b,default nvcc ... -ldl + -Xcompiler "-rdynamic + -Wl\,-b\,binary\,my_kernel.cu\,include/my_header.cuh\,-b\,default" + JITIFY_INCLUDE_EMBEDDED_FILE(my_kernel_cu); + JITIFY_INCLUDE_EMBEDDED_FILE(include_my_header_cuh); + + ---- + TODO + ---- + Extract valid compile options and pass the rest to cuModuleLoadDataEx + See if can have stringified headers automatically looked-up + by having stringify add them to a (static) global map. + The global map can be updated by creating a static class instance + whose constructor performs the registration. + Can then remove all headers from JitCache constructor in example code + See other TODOs in code +*/ + +/*! \file jitify.hpp + * \brief The Jitify library header + */ + +/*! \mainpage Jitify - A C++ library that simplifies the use of NVRTC + * \p Use class jitify::JitCache to manage and launch JIT-compiled CUDA + * kernels. + * + * \p Use namespace jitify::reflection to reflect types and values into + * code-strings. + * + * \p Use JITIFY_INCLUDE_EMBEDDED_FILE() to declare files that have been + * embedded into the executable using the GCC linker. + * + * \p Use jitify::parallel_for and JITIFY_LAMBDA() to generate and launch + * simple kernels. + */ + +#pragma once + +#ifndef JITIFY_THREAD_SAFE +#define JITIFY_THREAD_SAFE 1 +#endif + +#if JITIFY_ENABLE_EMBEDDED_FILES +#include +#endif +#include +#include +#include +#include // For strtok_r etc. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if JITIFY_THREAD_SAFE +#include +#endif + +#include +#include // For dim3, cudaStream_t +#if CUDA_VERSION >= 8000 +#define NVRTC_GET_TYPE_NAME 1 +#endif +#include + +// For use by get_current_executable_path(). +#ifdef __linux__ +#include // For PATH_MAX + +#include // For realpath +#define JITIFY_PATH_MAX PATH_MAX +#elif defined(_WIN32) || defined(_WIN64) +#include +#define JITIFY_PATH_MAX MAX_PATH +#else +#error "Unsupported platform" +#endif + +#ifdef _MSC_VER // MSVC compiler +#include // For UnDecorateSymbolName +#else +#include // For abi::__cxa_demangle +#endif + +#if defined(_WIN32) || defined(_WIN64) +// WAR for strtok_r being called strtok_s on Windows +#pragma push_macro("strtok_r") +#undef strtok_r +#define strtok_r strtok_s +// WAR for min and max possibly being macros defined by windows.h +#pragma push_macro("min") +#pragma push_macro("max") +#undef min +#undef max +#endif + +#ifndef JITIFY_PRINT_LOG +#define JITIFY_PRINT_LOG 1 +#endif + +#if JITIFY_PRINT_ALL +#define JITIFY_PRINT_INSTANTIATION 1 +#define JITIFY_PRINT_SOURCE 1 +#define JITIFY_PRINT_LOG 1 +#define JITIFY_PRINT_PTX 1 +#define JITIFY_PRINT_LINKER_LOG 1 +#define JITIFY_PRINT_LAUNCH 1 +#define JITIFY_PRINT_HEADER_PATHS 1 +#endif + +#if JITIFY_ENABLE_EMBEDDED_FILES +#define JITIFY_FORCE_UNDEFINED_SYMBOL(x) void* x##_forced = (void*)&x +/*! Include a source file that has been embedded into the executable using the + * GCC linker. + * \param name The name of the source file (not as a string), which must + * be sanitized by replacing non-alpha-numeric characters with underscores. + * E.g., \code{.cpp}JITIFY_INCLUDE_EMBEDDED_FILE(my_header_h)\endcode will + * include the embedded file "my_header.h". + * \note Files declared with this macro can be referenced using + * their original (unsanitized) filenames when creating a \p + * jitify::Program instance. 
+ */ +#define JITIFY_INCLUDE_EMBEDDED_FILE(name) \ + extern "C" uint8_t _jitify_binary_##name##_start[] asm("_binary_" #name \ + "_start"); \ + extern "C" uint8_t _jitify_binary_##name##_end[] asm("_binary_" #name \ + "_end"); \ + JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_start); \ + JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_end) +#endif // JITIFY_ENABLE_EMBEDDED_FILES + +/*! Jitify library namespace + */ +namespace jitify { + +/*! Source-file load callback. + * + * \param filename The name of the requested source file. + * \param tmp_stream A temporary stream that can be used to hold source code. + * \return A pointer to an input stream containing the source code, or NULL + * to defer loading of the file to Jitify's file-loading mechanisms. + */ +typedef std::istream* (*file_callback_type)(std::string filename, + std::iostream& tmp_stream); + +// Exclude from Doxygen +//! \cond + +class JitCache; + +// Simple cache using LRU discard policy +template +class ObjectCache { + public: + typedef KeyType key_type; + typedef ValueType value_type; + + private: + typedef std::map object_map; + typedef std::deque key_rank; + typedef typename key_rank::iterator rank_iterator; + object_map _objects; + key_rank _ranked_keys; + size_t _capacity; + + inline void discard_old(size_t n = 0) { + if (n > _capacity) { + throw std::runtime_error("Insufficient capacity in cache"); + } + while (_objects.size() > _capacity - n) { + key_type discard_key = _ranked_keys.back(); + _ranked_keys.pop_back(); + _objects.erase(discard_key); + } + } + + public: + inline ObjectCache(size_t capacity = 8) : _capacity(capacity) {} + inline void resize(size_t capacity) { + _capacity = capacity; + this->discard_old(); + } + inline bool contains(const key_type& k) const { + return (bool)_objects.count(k); + } + inline void touch(const key_type& k) { + if (!this->contains(k)) { + throw std::runtime_error("Key not found in cache"); + } + rank_iterator rank = std::find(_ranked_keys.begin(), _ranked_keys.end(), k); + if (rank != _ranked_keys.begin()) { + // Move key to front of ranks + _ranked_keys.erase(rank); + _ranked_keys.push_front(k); + } + } + inline value_type& get(const key_type& k) { + if (!this->contains(k)) { + throw std::runtime_error("Key not found in cache"); + } + this->touch(k); + return _objects[k]; + } + inline value_type& insert(const key_type& k, + const value_type& v = value_type()) { + this->discard_old(1); + _ranked_keys.push_front(k); + return _objects.insert(std::make_pair(k, v)).first->second; + } + template + inline value_type& emplace(const key_type& k, Args&&... 
args) { + this->discard_old(1); + // Note: Use of piecewise_construct allows non-movable non-copyable types + auto iter = _objects + .emplace(std::piecewise_construct, std::forward_as_tuple(k), + std::forward_as_tuple(args...)) + .first; + _ranked_keys.push_front(iter->first); + return iter->second; + } +}; + +namespace detail { + +// Convenience wrapper for std::vector that provides handy constructors +template +class vector : public std::vector { + typedef std::vector super_type; + + public: + vector() : super_type() {} + vector(size_t n) : super_type(n) {} // Note: Not explicit, allows =0 + vector(std::vector const& vals) : super_type(vals) {} + template + vector(T const (&vals)[N]) : super_type(vals, vals + N) {} + vector(std::vector&& vals) : super_type(vals) {} + vector(std::initializer_list vals) : super_type(vals) {} +}; + +// Helper functions for parsing/manipulating source code + +inline std::string replace_characters(std::string str, + std::string const& oldchars, + char newchar) { + size_t i = str.find_first_of(oldchars); + while (i != std::string::npos) { + str[i] = newchar; + i = str.find_first_of(oldchars, i + 1); + } + return str; +} +inline std::string sanitize_filename(std::string name) { + return replace_characters(name, "/\\.-: ?%*|\"<>", '_'); +} + +#if JITIFY_ENABLE_EMBEDDED_FILES +class EmbeddedData { + void* _app; + EmbeddedData(EmbeddedData const&); + EmbeddedData& operator=(EmbeddedData const&); + + public: + EmbeddedData() { + _app = dlopen(NULL, RTLD_LAZY); + if (!_app) { + throw std::runtime_error(std::string("dlopen failed: ") + dlerror()); + } + dlerror(); // Clear any existing error + } + ~EmbeddedData() { + if (_app) { + dlclose(_app); + } + } + const uint8_t* operator[](std::string key) const { + key = sanitize_filename(key); + key = "_binary_" + key; + uint8_t const* data = (uint8_t const*)dlsym(_app, key.c_str()); + if (!data) { + throw std::runtime_error(std::string("dlsym failed: ") + dlerror()); + } + return data; + } + const uint8_t* begin(std::string key) const { + return (*this)[key + "_start"]; + } + const uint8_t* end(std::string key) const { return (*this)[key + "_end"]; } +}; +#endif // JITIFY_ENABLE_EMBEDDED_FILES + +inline bool is_tokenchar(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '_'; +} +inline std::string replace_token(std::string src, std::string token, + std::string replacement) { + size_t i = src.find(token); + while (i != std::string::npos) { + if (i == 0 || i == src.size() - token.size() || + (!is_tokenchar(src[i - 1]) && !is_tokenchar(src[i + token.size()]))) { + src.replace(i, token.size(), replacement); + i += replacement.size(); + } else { + i += token.size(); + } + i = src.find(token, i); + } + return src; +} +inline std::string path_base(std::string p) { + // "/usr/local/myfile.dat" -> "/usr/local" + // "foo/bar" -> "foo" + // "foo/bar/" -> "foo/bar" +#if defined _WIN32 || defined _WIN64 + char sep = '\\'; +#else + char sep = '/'; +#endif + size_t i = p.find_last_of(sep); + if (i != std::string::npos) { + return p.substr(0, i); + } else { + return ""; + } +} +inline std::string path_join(std::string p1, std::string p2) { +#ifdef _WIN32 + char sep = '\\'; +#else + char sep = '/'; +#endif + if (p1.size() && p2.size() && p2[0] == sep) { + throw std::invalid_argument("Cannot join to absolute path"); + } + if (p1.size() && p1[p1.size() - 1] != sep) { + p1 += sep; + } + return p1 + p2; +} +// Elides "/." and "/.." tokens from path. 
+inline std::string path_simplify(const std::string& path) { + std::vector dirs; + std::string cur_dir; + bool after_slash = false; + for (int i = 0; i < (int)path.size(); ++i) { + if (path[i] == '/') { + if (after_slash) continue; // Ignore repeat slashes + after_slash = true; + if (cur_dir == ".." && !dirs.empty() && dirs.back() != "..") { + if (dirs.size() == 1 && dirs.front().empty()) { + throw std::runtime_error( + "Invalid path: back-traversals exceed depth of absolute path"); + } + dirs.pop_back(); + } else if (cur_dir != ".") { // Ignore /./ + dirs.push_back(cur_dir); + } + cur_dir.clear(); + } else { + after_slash = false; + cur_dir.push_back(path[i]); + } + } + if (!after_slash) { + dirs.push_back(cur_dir); + } + std::stringstream ss; + for (int i = 0; i < (int)dirs.size() - 1; ++i) { + ss << dirs[i] << "/"; + } + if (!dirs.empty()) ss << dirs.back(); + if (after_slash) ss << "/"; + return ss.str(); +} +inline unsigned long long hash_larson64(const char* s, + unsigned long long seed = 0) { + unsigned long long hash = seed; + while (*s) { + hash = hash * 101 + *s++; + } + return hash; +} + +inline uint64_t hash_combine(uint64_t a, uint64_t b) { + // Note: The magic number comes from the golden ratio + return a ^ (0x9E3779B97F4A7C17ull + b + (b >> 2) + (a << 6)); +} + +inline bool extract_include_info_from_compile_error(std::string log, + std::string& name, + std::string& parent, + int& line_num) { + static const std::vector pattern = { + "could not open source file \"", "cannot open source file \""}; + + for (auto& p : pattern) { + size_t beg = log.find(p); + if (beg != std::string::npos) { + beg += p.size(); + size_t end = log.find("\"", beg); + name = log.substr(beg, end - beg); + + size_t line_beg = log.rfind("\n", beg); + if (line_beg == std::string::npos) { + line_beg = 0; + } else { + line_beg += 1; + } + + size_t split = log.find("(", line_beg); + parent = log.substr(line_beg, split - line_beg); + line_num = + atoi(log.substr(split + 1, log.find(")", split + 1) - (split + 1)) + .c_str()); + + return true; + } + } + + return false; +} + +inline bool is_include_directive_with_quotes(const std::string& source, + int line_num) { + // TODO: Check each find() for failure. 
+ size_t beg = 0; + for (int i = 1; i < line_num; ++i) { + beg = source.find("\n", beg) + 1; + } + beg = source.find("include", beg) + 7; + beg = source.find_first_of("\"<", beg); + return source[beg] == '"'; +} + +inline std::string comment_out_code_line(int line_num, std::string source) { + size_t beg = 0; + for (int i = 1; i < line_num; ++i) { + beg = source.find("\n", beg) + 1; + } + return (source.substr(0, beg) + "//" + source.substr(beg)); +} + +inline void print_with_line_numbers(std::string const& source) { + int linenum = 1; + std::stringstream source_ss(source); + for (std::string line; std::getline(source_ss, line); ++linenum) { + std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line + << std::endl; + } +} + +inline void print_compile_log(std::string program_name, + std::string const& log) { + std::cout << "---------------------------------------------------" + << std::endl; + std::cout << "--- JIT compile log for " << program_name << " ---" + << std::endl; + std::cout << "---------------------------------------------------" + << std::endl; + std::cout << log << std::endl; + std::cout << "---------------------------------------------------" + << std::endl; +} + +inline std::vector split_string(std::string str, + long maxsplit = -1, + std::string delims = " \t") { + std::vector results; + if (maxsplit == 0) { + results.push_back(str); + return results; + } + // Note: +1 to include NULL-terminator + std::vector v_str(str.c_str(), str.c_str() + (str.size() + 1)); + char* c_str = v_str.data(); + char* saveptr = c_str; + char* token = nullptr; + for (long i = 0; i != maxsplit; ++i) { + token = ::strtok_r(c_str, delims.c_str(), &saveptr); + c_str = 0; + if (!token) { + return results; + } + results.push_back(token); + } + // Check if there's a final piece + token += ::strlen(token) + 1; + if (token - v_str.data() < (ptrdiff_t)str.size()) { + // Find the start of the final piece + token += ::strspn(token, delims.c_str()); + if (*token) { + results.push_back(token); + } + } + return results; +} + +static const std::map& get_jitsafe_headers_map(); + +inline bool load_source( + std::string filename, std::map& sources, + std::string current_dir = "", + std::vector include_paths = std::vector(), + file_callback_type file_callback = 0, + std::map* fullpaths = nullptr, + bool search_current_dir = true) { + std::istream* source_stream = 0; + std::stringstream string_stream; + std::ifstream file_stream; + // First detect direct source-code string ("my_program\nprogram_code...") + size_t newline_pos = filename.find("\n"); + if (newline_pos != std::string::npos) { + std::string source = filename.substr(newline_pos + 1); + filename = filename.substr(0, newline_pos); + string_stream << source; + source_stream = &string_stream; + } + if (sources.count(filename)) { + // Already got this one + return true; + } + if (!source_stream) { + std::string fullpath = path_join(current_dir, filename); + // Try loading from callback + if (!file_callback || + !(source_stream = file_callback(fullpath, string_stream))) { +#if JITIFY_ENABLE_EMBEDDED_FILES + // Try loading as embedded file + EmbeddedData embedded; + std::string source; + try { + source.assign(embedded.begin(fullpath), embedded.end(fullpath)); + string_stream << source; + source_stream = &string_stream; + } catch (std::runtime_error const&) +#endif // JITIFY_ENABLE_EMBEDDED_FILES + { + // Try loading from filesystem + bool found_file = false; + if (search_current_dir) { + file_stream.open(fullpath.c_str()); + if (file_stream) { + 
source_stream = &file_stream; + found_file = true; + } + } + // Search include directories + if (!found_file) { + for (int i = 0; i < (int)include_paths.size(); ++i) { + fullpath = path_join(include_paths[i], filename); + file_stream.open(fullpath.c_str()); + if (file_stream) { + source_stream = &file_stream; + found_file = true; + break; + } + } + if (!found_file) { + // Try loading from builtin headers + fullpath = path_join("__jitify_builtin", filename); + auto it = get_jitsafe_headers_map().find(filename); + if (it != get_jitsafe_headers_map().end()) { + string_stream << it->second; + source_stream = &string_stream; + } else { + return false; + } + } + } + } + } + if (fullpaths) { + // Record the full file path corresponding to this include name. + (*fullpaths)[filename] = path_simplify(fullpath); + } + } + sources[filename] = std::string(); + std::string& source = sources[filename]; + std::string line; + size_t linenum = 0; + unsigned long long hash = 0; + bool pragma_once = false; + bool remove_next_blank_line = false; + while (std::getline(*source_stream, line)) { + ++linenum; + + // HACK WAR for static variables not allowed on the device (unless + // __shared__) + // TODO: This breaks static member variables + // line = replace_token(line, "static const", "/*static*/ const"); + + // TODO: Need to watch out for /* */ comments too + std::string cleanline = + line.substr(0, line.find("//")); // Strip line comments + // if( cleanline.back() == "\r" ) { // Remove Windows line ending + // cleanline = cleanline.substr(0, cleanline.size()-1); + //} + // TODO: Should trim whitespace before checking .empty() + if (cleanline.empty() && remove_next_blank_line) { + remove_next_blank_line = false; + continue; + } + // Maintain a file hash for use in #pragma once WAR + hash = hash_larson64(line.c_str(), hash); + if (cleanline.find("#pragma once") != std::string::npos) { + pragma_once = true; + // Note: This is an attempt to recover the original line numbering, + // which otherwise gets off-by-one due to the include guard. + remove_next_blank_line = true; + // line = "//" + line; // Comment out the #pragma once line + continue; + } + + // HACK WAR for Thrust using "#define FOO #pragma bar" + size_t pragma_beg = cleanline.find("#pragma "); + if (pragma_beg != std::string::npos) { + std::string line_after_pragma = line.substr(pragma_beg); + std::vector pragma_split = + split_string(line_after_pragma, 2); + line = + (line.substr(0, pragma_beg) + "_Pragma(\"" + pragma_split[1] + "\")"); + if (pragma_split.size() == 3) { + line += " " + pragma_split[2]; + } + } + + source += line + "\n"; + } + // HACK TESTING (WAR for cub) + // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; + ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + + /// source; + + // WAR for #pragma once causing problems when there are multiple inclusions + // of the same header from different paths. 
+ if (pragma_once) { + std::stringstream ss; + ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0') + << hash; + std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n"; + std::string include_guard_header; + include_guard_header += "#ifndef " + include_guard_name; + include_guard_header += "#define " + include_guard_name; + std::string include_guard_footer; + include_guard_footer += "#endif // " + include_guard_name; + source = include_guard_header + source + "\n" + include_guard_footer; + } + // return filename; + return true; +} + +} // namespace detail + +//! \endcond + +/*! Jitify reflection utilities namespace + */ +namespace reflection { + +// Provides type and value reflection via a function 'reflect': +// reflect() -> "Type" +// reflect(value) -> "(T)value" +// reflect() -> "VAL" +// reflect -> "VAL" +// reflect_template,char>() -> "" +// reflect_template({"float", "7", "char"}) -> "" + +/*! A wrapper class for non-type template parameters. + */ +template +struct NonType { + constexpr static T VALUE = VALUE_; +}; + +// Forward declaration +template +inline std::string reflect(T const& value); + +//! \cond + +namespace detail { + +template +inline std::string value_string(const T& x) { + std::stringstream ss; + ss << x; + return ss.str(); +} +// WAR for non-printable characters +template <> +inline std::string value_string(const char& x) { + std::stringstream ss; + ss << (int)x; + return ss.str(); +} +template <> +inline std::string value_string(const signed char& x) { + std::stringstream ss; + ss << (int)x; + return ss.str(); +} +template <> +inline std::string value_string(const unsigned char& x) { + std::stringstream ss; + ss << (int)x; + return ss.str(); +} +template <> +inline std::string value_string(const wchar_t& x) { + std::stringstream ss; + ss << (long)x; + return ss.str(); +} +// Specialisation for bool true/false literals +template <> +inline std::string value_string(const bool& x) { + return x ? "true" : "false"; +} + +// Removes all tokens that start with double underscores. +inline void strip_double_underscore_tokens(char* s) { + using jitify::detail::is_tokenchar; + char* w = s; + do { + if (*s == '_' && *(s + 1) == '_') { + while (is_tokenchar(*++s)) + ; + } + } while ((*w++ = *s++)); +} + +//#if CUDA_VERSION < 8000 +#ifdef _MSC_VER // MSVC compiler +inline std::string demangle_cuda_symbol(const char* mangled_name) { + // We don't have a way to demangle CUDA symbol names under MSVC. + return mangled_name; +} +inline std::string demangle_native_type(const std::type_info& typeinfo) { + // Get the decorated name and skip over the leading '.'. + const char* decorated_name = typeinfo.raw_name() + 1; + char undecorated_name[4096]; + if (UnDecorateSymbolName( + decorated_name, undecorated_name, + sizeof(undecorated_name) / sizeof(*undecorated_name), + UNDNAME_NO_ARGUMENTS | // Treat input as a type name + UNDNAME_NAME_ONLY // No "class" and "struct" prefixes + /*UNDNAME_NO_MS_KEYWORDS*/)) { // No "__cdecl", "__ptr64" etc. + // WAR for UNDNAME_NO_MS_KEYWORDS messing up function types. 
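+      // (Illustrative, hypothetical output) e.g. an undecorated name such as
+      // "int(__cdecl*)(int)" is reduced to "int(*)(int)" by stripping the
+      // double-underscore keyword tokens in the call below.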
+ strip_double_underscore_tokens(undecorated_name); + return undecorated_name; + } + throw std::runtime_error("UnDecorateSymbolName failed"); +} +#else // not MSVC +inline std::string demangle_cuda_symbol(const char* mangled_name) { + size_t bufsize = 0; + char* buf = nullptr; + std::string demangled_name; + int status; + auto demangled_ptr = std::unique_ptr( + abi::__cxa_demangle(mangled_name, buf, &bufsize, &status), free); + if (status == 0) { + demangled_name = demangled_ptr.get(); // all worked as expected + } else if (status == -2) { + demangled_name = mangled_name; // we interpret this as plain C name + } else if (status == -1) { + throw std::runtime_error( + std::string("memory allocation failure in __cxa_demangle")); + } else if (status == -3) { + throw std::runtime_error(std::string("invalid argument to __cxa_demangle")); + } + return demangled_name; +} +inline std::string demangle_native_type(const std::type_info& typeinfo) { + return demangle_cuda_symbol(typeinfo.name()); +} +#endif // not MSVC +//#endif // CUDA_VERSION < 8000 + +template +class JitifyTypeNameWrapper_ {}; + +template +struct type_reflection { + inline static std::string name() { + //#if CUDA_VERSION < 8000 + // TODO: Use nvrtcGetTypeName once it has the same behavior as this. + // WAR for typeid discarding cv qualifiers on value-types + // Wrap type in dummy template class to preserve cv-qualifiers, then strip + // off the wrapper from the resulting string. + std::string wrapped_name = + demangle_native_type(typeid(JitifyTypeNameWrapper_)); + // Note: The reflected name of this class also has namespace prefixes. + const std::string wrapper_class_name = "JitifyTypeNameWrapper_<"; + size_t start = wrapped_name.find(wrapper_class_name); + if (start == std::string::npos) { + throw std::runtime_error("Type reflection failed: " + wrapped_name); + } + start += wrapper_class_name.size(); + std::string name = + wrapped_name.substr(start, wrapped_name.size() - (start + 1)); + return name; + //#else + // std::string ret; + // nvrtcResult status = nvrtcGetTypeName(&ret); + // if( status != NVRTC_SUCCESS ) { + // throw std::runtime_error(std::string("nvrtcGetTypeName + // failed: + //")+ nvrtcGetErrorString(status)); + // } + // return ret; + //#endif + } +}; // namespace detail +template +struct type_reflection > { + inline static std::string name() { + return jitify::reflection::reflect(VALUE); + } +}; + +} // namespace detail + +//! \endcond + +/*! Create an Instance object that contains a const reference to the + * value. We use this to wrap abstract objects from which we want to extract + * their type at runtime (e.g., derived type). This is used to facilitate + * templating on derived type when all we know at compile time is abstract + * type. + */ +template +struct Instance { + const T& value; + Instance(const T& value) : value(value) {} +}; + +/*! Create an Instance object from which we can extract the value's run-time + * type. + * \param value The const value to be captured. + */ +template +inline Instance instance_of(T const& value) { + return Instance(value); +} + +/*! A wrapper used for representing types as values. + */ +template +struct Type {}; + +// Type reflection +// E.g., reflect() -> "float" +// Note: This strips trailing const and volatile qualifiers +/*! Generate a code-string for a type. + * \code{.cpp}reflect() --> "float"\endcode + */ +template +inline std::string reflect() { + return detail::type_reflection::name(); +} +// Value reflection +// E.g., reflect(3.14f) -> "(float)3.14" +/*! 
Generate a code-string for a value. + * \code{.cpp}reflect(3.14f) --> "(float)3.14"\endcode + */ +template +inline std::string reflect(T const& value) { + return "(" + reflect() + ")" + detail::value_string(value); +} +// Non-type template arg reflection (implicit conversion to int64_t) +// E.g., reflect<7>() -> "(int64_t)7" +/*! Generate a code-string for an integer non-type template argument. + * \code{.cpp}reflect<7>() --> "(int64_t)7"\endcode + */ +template +inline std::string reflect() { + return reflect >(); +} +// Non-type template arg reflection (explicit type) +// E.g., reflect() -> "(int)7" +/*! Generate a code-string for a generic non-type template argument. + * \code{.cpp} reflect() --> "(int)7" \endcode + */ +template +inline std::string reflect() { + return reflect >(); +} +// Type reflection via value +// E.g., reflect(Type()) -> "float" +/*! Generate a code-string for a type wrapped as a Type instance. + * \code{.cpp}reflect(Type()) --> "float"\endcode + */ +template +inline std::string reflect(jitify::reflection::Type) { + return reflect(); +} + +/*! Generate a code-string for a type wrapped as an Instance instance. + * \code{.cpp}reflect(Instance(3.1f)) --> "float"\endcode + * or more simply when passed to a instance_of helper + * \code{.cpp}reflect(instance_of(3.1f)) --> "float"\endcodei + * This is specifically for the case where we want to extract the run-time + * type, e.g., derived type, of an object pointer. + */ +template +inline std::string reflect(jitify::reflection::Instance& value) { + return detail::demangle_native_type(typeid(value.value)); +} + +// Type from value +// E.g., type_of(3.14f) -> Type() +/*! Create a Type object representing a value's type. + * \param value The value whose type is to be captured. + */ +template +inline Type type_of(T& value) { + return Type(); +} +/*! Create a Type object representing a value's type. + * \param value The const value whose type is to be captured. + */ +template +inline Type type_of(T const& value) { + return Type(); +} + +// Multiple value reflections one call, returning list of strings +template +inline std::vector reflect_all(Args... args) { + return {reflect(args)...}; +} + +inline std::string reflect_list(jitify::detail::vector const& args, + std::string opener = "", + std::string closer = "") { + std::stringstream ss; + ss << opener; + for (int i = 0; i < (int)args.size(); ++i) { + if (i > 0) ss << ","; + ss << args[i]; + } + ss << closer; + return ss.str(); +} + +// Template instantiation reflection +// inline std::string reflect_template(std::vector const& args) { +inline std::string reflect_template( + jitify::detail::vector const& args) { + // Note: The space in " >" is a WAR to avoid '>>' appearing + return reflect_list(args, "<", " >"); +} +// TODO: See if can make this evaluate completely at compile-time +template +inline std::string reflect_template() { + return reflect_template({reflect()...}); + // return reflect_template({reflect()...}); +} + +} // namespace reflection + +//! \cond + +namespace detail { + +// Demangles nested variable names using the PTX name mangling scheme +// (which follows the Itanium64 ABI). E.g., _ZN1a3Foo2bcE -> a::Foo::bc. +inline std::string demangle_ptx_variable_name(const char* name) { + std::stringstream ss; + const char* c = name; + if (*c++ != '_' || *c++ != 'Z') return name; // Non-mangled name + if (*c++ != 'N') return ""; // Not a nested name, unsupported + while (true) { + // Parse identifier length. 
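+    // Worked example (matching the comment above): for "_ZN1a3Foo2bcE" this
+    // loop reads length/identifier pairs 1:"a", 3:"Foo", 2:"bc", stops at the
+    // trailing 'E', and produces "a::Foo::bc".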
+ int n = 0; + while (std::isdigit(*c)) { + n = n * 10 + (*c - '0'); + c++; + } + if (!n) return ""; // Invalid or unsupported mangled name + // Parse identifier. + const char* c0 = c; + while (n-- && *c) c++; + if (!*c) return ""; // Mangled name is truncated + std::string id(c0, c); + // Identifiers starting with "_GLOBAL" are anonymous namespaces. + ss << (id.substr(0, 7) == "_GLOBAL" ? "(anonymous namespace)" : id); + // Nested name specifiers end with 'E'. + if (*c == 'E') break; + // There are more identifiers to come, add join token. + ss << "::"; + } + return ss.str(); +} + +static const char* get_current_executable_path() { + static const char* path = []() -> const char* { + static char buffer[JITIFY_PATH_MAX] = {}; +#ifdef __linux__ + if (!::realpath("/proc/self/exe", buffer)) return nullptr; +#elif defined(_WIN32) || defined(_WIN64) + if (!GetModuleFileNameA(nullptr, buffer, JITIFY_PATH_MAX)) return nullptr; +#endif + return buffer; + }(); + return path; +} + +inline bool endswith(const std::string& str, const std::string& suffix) { + return str.size() >= suffix.size() && + str.substr(str.size() - suffix.size()) == suffix; +} + +// Infers the JIT input type from the filename suffix. If no known suffix is +// present, the filename is assumed to refer to a library, and the associated +// suffix (and possibly prefix) is automatically added to the filename. +inline CUjitInputType get_cuda_jit_input_type(std::string* filename) { + if (endswith(*filename, ".ptx")) { + return CU_JIT_INPUT_PTX; + } else if (endswith(*filename, ".cubin")) { + return CU_JIT_INPUT_CUBIN; + } else if (endswith(*filename, ".fatbin")) { + return CU_JIT_INPUT_FATBINARY; + } else if (endswith(*filename, +#if defined _WIN32 || defined _WIN64 + ".obj" +#else // Linux + ".o" +#endif + )) { + return CU_JIT_INPUT_OBJECT; + } else { // Assume library +#if defined _WIN32 || defined _WIN64 + if (!endswith(*filename, ".lib")) { + *filename += ".lib"; + } +#else // Linux + if (!endswith(*filename, ".a")) { + *filename = "lib" + *filename + ".a"; + } +#endif + return CU_JIT_INPUT_LIBRARY; + } +} + +class CUDAKernel { + std::vector _link_files; + std::vector _link_paths; + CUlinkState _link_state; + CUmodule _module; + CUfunction _kernel; + std::string _func_name; + std::string _ptx; + std::map _global_map; + std::vector _opts; + std::vector _optvals; +#ifdef JITIFY_PRINT_LINKER_LOG + static const unsigned int _log_size = 8192; + char _error_log[_log_size]; + char _info_log[_log_size]; +#endif + + inline void cuda_safe_call(CUresult res) const { + if (res != CUDA_SUCCESS) { + const char* msg; + cuGetErrorName(res, &msg); + throw std::runtime_error(msg); + } + } + inline void create_module(std::vector link_files, + std::vector link_paths) { + CUresult result; +#ifndef JITIFY_PRINT_LINKER_LOG + // WAR since linker log does not seem to be constructed using a single call + // to cuModuleLoadDataEx. + if (link_files.empty()) { + result = + cuModuleLoadDataEx(&_module, _ptx.c_str(), (unsigned)_opts.size(), + _opts.data(), _optvals.data()); + } else +#endif + { + cuda_safe_call(cuLinkCreate((unsigned)_opts.size(), _opts.data(), + _optvals.data(), &_link_state)); + cuda_safe_call(cuLinkAddData(_link_state, CU_JIT_INPUT_PTX, + (void*)_ptx.c_str(), _ptx.size(), + "jitified_source.ptx", 0, 0, 0)); + for (int i = 0; i < (int)link_files.size(); ++i) { + std::string link_file = link_files[i]; + CUjitInputType jit_input_type; + if (link_file == ".") { + // Special case for linking to current executable. 
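+          // (Illustrative) a "-l." option yields link_files = {"."} and links
+          // against the running executable itself; a plain name such as
+          // "cudadevrt" would instead be resolved below to "libcudadevrt.a"
+          // on Linux (see get_cuda_jit_input_type above).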
+ link_file = get_current_executable_path(); + jit_input_type = CU_JIT_INPUT_OBJECT; + } else { + // Infer based on filename. + jit_input_type = get_cuda_jit_input_type(&link_file); + } + CUresult result = cuLinkAddFile(_link_state, jit_input_type, + link_file.c_str(), 0, 0, 0); + int path_num = 0; + while (result == CUDA_ERROR_FILE_NOT_FOUND && + path_num < (int)link_paths.size()) { + std::string filename = path_join(link_paths[path_num++], link_file); + result = cuLinkAddFile(_link_state, jit_input_type, filename.c_str(), + 0, 0, 0); + } +#if JITIFY_PRINT_LINKER_LOG + if (result == CUDA_ERROR_FILE_NOT_FOUND) { + std::cerr << "Linker error: Device library not found: " << link_file + << std::endl; + } else if (result != CUDA_SUCCESS) { + std::cerr << "Linker error: Failed to add file: " << link_file + << std::endl; + std::cerr << _error_log << std::endl; + } +#endif + cuda_safe_call(result); + } + size_t cubin_size; + void* cubin; + result = cuLinkComplete(_link_state, &cubin, &cubin_size); + if (result == CUDA_SUCCESS) { + result = cuModuleLoadData(&_module, cubin); + } + } +#ifdef JITIFY_PRINT_LINKER_LOG + std::cout << "---------------------------------------" << std::endl; + std::cout << "--- Linker for " + << reflection::detail::demangle_cuda_symbol(_func_name.c_str()) + << " ---" << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << _info_log << std::endl; + std::cout << std::endl; + std::cout << _error_log << std::endl; + std::cout << "---------------------------------------" << std::endl; +#endif + cuda_safe_call(result); + // Allow _func_name to be empty to support cases where we want to generate + // PTX containing extern symbol definitions but no kernels. + if (!_func_name.empty()) { + cuda_safe_call( + cuModuleGetFunction(&_kernel, _module, _func_name.c_str())); + } + } + inline void destroy_module() { + if (_link_state) { + cuda_safe_call(cuLinkDestroy(_link_state)); + } + _link_state = 0; + if (_module) { + cuModuleUnload(_module); + } + _module = 0; + } + + // create a map of __constant__ and __device__ variables in the ptx file + // mapping demangled to mangled name + inline void create_global_variable_map() { + size_t pos = 0; + while (pos < _ptx.size()) { + pos = std::min(_ptx.find(".const .align", pos), + _ptx.find(".global .align", pos)); + if (pos == std::string::npos) break; + size_t end = _ptx.find_first_of(";=", pos); + if (_ptx[end] == '=') --end; + std::string line = _ptx.substr(pos, end - pos); + pos = end; + size_t symbol_start = line.find_last_of(" ") + 1; + size_t symbol_end = line.find_last_of("["); + std::string entry = line.substr(symbol_start, symbol_end - symbol_start); + std::string key = detail::demangle_ptx_variable_name(entry.c_str()); + // Skip unsupported mangled names. E.g., a static variable defined inside + // a function (such variables are not directly addressable from outside + // the function, so skipping them is the correct behavior). 
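+      // (Illustrative PTX, hypothetical symbol) a declaration such as
+      //   .global .align 4 .b32 _ZN5Scope3varE;
+      // yields the map entry  "Scope::var" -> "_ZN5Scope3varE".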
+ if (key == "") continue; + _global_map[key] = entry; + } + } + + inline void set_linker_log() { +#ifdef JITIFY_PRINT_LINKER_LOG + _opts.push_back(CU_JIT_INFO_LOG_BUFFER); + _optvals.push_back((void*)_info_log); + _opts.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES); + _optvals.push_back((void*)(long)_log_size); + _opts.push_back(CU_JIT_ERROR_LOG_BUFFER); + _optvals.push_back((void*)_error_log); + _opts.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); + _optvals.push_back((void*)(long)_log_size); + _opts.push_back(CU_JIT_LOG_VERBOSE); + _optvals.push_back((void*)1); +#endif + } + + public: + inline CUDAKernel() : _link_state(0), _module(0), _kernel(0) {} + inline CUDAKernel(const CUDAKernel& other) = delete; + inline CUDAKernel& operator=(const CUDAKernel& other) = delete; + inline CUDAKernel(CUDAKernel&& other) = delete; + inline CUDAKernel& operator=(CUDAKernel&& other) = delete; + inline CUDAKernel(const char* func_name, const char* ptx, + std::vector link_files, + std::vector link_paths, unsigned int nopts = 0, + CUjit_option* opts = 0, void** optvals = 0) + : _link_files(link_files), + _link_paths(link_paths), + _link_state(0), + _module(0), + _kernel(0), + _func_name(func_name), + _ptx(ptx), + _opts(opts, opts + nopts), + _optvals(optvals, optvals + nopts) { + this->set_linker_log(); + this->create_module(link_files, link_paths); + this->create_global_variable_map(); + } + + inline CUDAKernel& set(const char* func_name, const char* ptx, + std::vector link_files, + std::vector link_paths, + unsigned int nopts = 0, CUjit_option* opts = 0, + void** optvals = 0) { + this->destroy_module(); + _func_name = func_name; + _ptx = ptx; + _link_files = link_files; + _link_paths = link_paths; + _opts.assign(opts, opts + nopts); + _optvals.assign(optvals, optvals + nopts); + this->set_linker_log(); + this->create_module(link_files, link_paths); + this->create_global_variable_map(); + return *this; + } + inline ~CUDAKernel() { this->destroy_module(); } + inline operator CUfunction() const { return _kernel; } + + inline CUresult launch(dim3 grid, dim3 block, unsigned int smem, + CUstream stream, std::vector arg_ptrs) const { + return cuLaunchKernel(_kernel, grid.x, grid.y, grid.z, block.x, block.y, + block.z, smem, stream, arg_ptrs.data(), NULL); + } + + inline CUdeviceptr get_global_ptr(const char* name, + size_t* size = nullptr) const { + CUdeviceptr global_ptr = 0; + auto global = _global_map.find(name); + if (global != _global_map.end()) { + cuda_safe_call(cuModuleGetGlobal(&global_ptr, size, _module, + global->second.c_str())); + } else { + throw std::runtime_error(std::string("failed to look up global ") + name); + } + return global_ptr; + } + + template + inline CUresult get_global_data(const char* name, T* data, size_t count, + CUstream stream = 0) const { + size_t size_bytes; + CUdeviceptr ptr = get_global_ptr(name, &size_bytes); + size_t given_size_bytes = count * sizeof(T); + if (given_size_bytes != size_bytes) { + throw std::runtime_error( + std::string("Value for global variable ") + name + + " has wrong size: got " + std::to_string(given_size_bytes) + + " bytes, expected " + std::to_string(size_bytes)); + } + return cuMemcpyDtoH(data, ptr, size_bytes); + } + + template + inline CUresult set_global_data(const char* name, const T* data, size_t count, + CUstream stream = 0) const { + size_t size_bytes; + CUdeviceptr ptr = get_global_ptr(name, &size_bytes); + size_t given_size_bytes = count * sizeof(T); + if (given_size_bytes != size_bytes) { + throw std::runtime_error( + std::string("Value 
for global variable ") + name + + " has wrong size: got " + std::to_string(given_size_bytes) + + " bytes, expected " + std::to_string(size_bytes)); + } + return cuMemcpyHtoD(ptr, data, size_bytes); + } + + const std::string& function_name() const { return _func_name; } + const std::string& ptx() const { return _ptx; } + const std::vector& link_files() const { return _link_files; } + const std::vector& link_paths() const { return _link_paths; } +}; + +static const char* jitsafe_header_preinclude_h = R"( +//// WAR for Thrust (which appears to have forgotten to include this in result_of_adaptable_function.h +//#include + +//// WAR for Thrust (which appear to have forgotten to include this in error_code.h) +//#include + +// WAR for Thrust (which only supports gnuc, clang or msvc) +#define __GNUC__ 4 + +// WAR for generics/shfl.h +#define THRUST_STATIC_ASSERT(x) + +// WAR for CUB +#ifdef __host__ +#undef __host__ +#endif +#define __host__ + +// WAR to allow exceptions to be parsed +#define try +#define catch(...) +)"; + + +static const char* jitsafe_header_float_h = R"( +#pragma once + +#define FLT_RADIX 2 +#define FLT_MANT_DIG 24 +#define DBL_MANT_DIG 53 +#define FLT_DIG 6 +#define DBL_DIG 15 +#define FLT_MIN_EXP -125 +#define DBL_MIN_EXP -1021 +#define FLT_MIN_10_EXP -37 +#define DBL_MIN_10_EXP -307 +#define FLT_MAX_EXP 128 +#define DBL_MAX_EXP 1024 +#define FLT_MAX_10_EXP 38 +#define DBL_MAX_10_EXP 308 +#define FLT_MAX 3.4028234e38f +#define DBL_MAX 1.7976931348623157e308 +#define FLT_EPSILON 1.19209289e-7f +#define DBL_EPSILON 2.220440492503130e-16 +#define FLT_MIN 1.1754943e-38f; +#define DBL_MIN 2.2250738585072013e-308 +#define FLT_ROUNDS 1 +#if defined __cplusplus && __cplusplus >= 201103L +#define FLT_EVAL_METHOD 0 +#define DECIMAL_DIG 21 +#endif +)"; + +static const char* jitsafe_header_limits_h = R"( +#pragma once + +#if defined _WIN32 || defined _WIN64 + #define __WORDSIZE 32 +#else + #if defined __x86_64__ && !defined __ILP32__ + #define __WORDSIZE 64 + #else + #define __WORDSIZE 32 + #endif +#endif +#define MB_LEN_MAX 16 +#define CHAR_BIT 8 +#define SCHAR_MIN (-128) +#define SCHAR_MAX 127 +#define UCHAR_MAX 255 +enum { + _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0, + CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN, + CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? 
UCHAR_MAX : SCHAR_MAX, +}; +#define SHRT_MIN (-32768) +#define SHRT_MAX 32767 +#define USHRT_MAX 65535 +#define INT_MIN (-INT_MAX - 1) +#define INT_MAX 2147483647 +#define UINT_MAX 4294967295U +#if __WORDSIZE == 64 + # define LONG_MAX 9223372036854775807L +#else + # define LONG_MAX 2147483647L +#endif +#define LONG_MIN (-LONG_MAX - 1L) +#if __WORDSIZE == 64 + #define ULONG_MAX 18446744073709551615UL +#else + #define ULONG_MAX 4294967295UL +#endif +#define LLONG_MAX 9223372036854775807LL +#define LLONG_MIN (-LLONG_MAX - 1LL) +#define ULLONG_MAX 18446744073709551615ULL +)"; + +static const char* jitsafe_header_iterator = R"( +#pragma once + +namespace __jitify_iterator_ns { +struct output_iterator_tag {}; +struct input_iterator_tag {}; +struct forward_iterator_tag {}; +struct bidirectional_iterator_tag {}; +struct random_access_iterator_tag {}; +template +struct iterator_traits { + typedef typename Iterator::iterator_category iterator_category; + typedef typename Iterator::value_type value_type; + typedef typename Iterator::difference_type difference_type; + typedef typename Iterator::pointer pointer; + typedef typename Iterator::reference reference; +}; +template +struct iterator_traits { + typedef random_access_iterator_tag iterator_category; + typedef T value_type; + typedef ptrdiff_t difference_type; + typedef T* pointer; + typedef T& reference; +}; +template +struct iterator_traits { + typedef random_access_iterator_tag iterator_category; + typedef T value_type; + typedef ptrdiff_t difference_type; + typedef T const* pointer; + typedef T const& reference; +}; +} // namespace __jitify_iterator_ns +namespace std { using namespace __jitify_iterator_ns; } +using namespace __jitify_iterator_ns; +)"; + +// TODO: This is incomplete; need floating point limits +// Joe Eaton: added IEEE float and double types, none of the smaller types +// using type specific structs since we can't template on floats. 
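+// Illustrative sketch (not part of jitify): these jitsafe strings are served
+// by load_source() via get_jitsafe_headers_map() (declared above) whenever
+// JIT-compiled code does e.g. `#include <limits>`, instead of reading the
+// host's system header. The snippet is an example only and kept out of the
+// build:
+#if 0
+  std::map<std::string, std::string> sources;
+  // Resolves "limits" through the builtin jitsafe header map.
+  bool ok = load_source("limits", sources);
+  // On success, sources["limits"] holds the jitsafe <limits> replacement text.
+#endif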
+static const char* jitsafe_header_limits = R"( +#pragma once +#include +#include +// TODO: epsilon(), infinity(), etc +namespace __jitify_detail { +#if __cplusplus >= 201103L +#define JITIFY_CXX11_CONSTEXPR constexpr +#define JITIFY_CXX11_NOEXCEPT noexcept +#else +#define JITIFY_CXX11_CONSTEXPR +#define JITIFY_CXX11_NOEXCEPT +#endif + +struct FloatLimits { +#if __cplusplus >= 201103L + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + float lowest() JITIFY_CXX11_NOEXCEPT { return -FLT_MAX;} + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + float min() JITIFY_CXX11_NOEXCEPT { return FLT_MIN; } + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + float max() JITIFY_CXX11_NOEXCEPT { return FLT_MAX; } +#endif // __cplusplus >= 201103L + enum { + is_specialized = true, + is_signed = true, + is_integer = false, + is_exact = false, + has_infinity = true, + has_quiet_NaN = true, + has_signaling_NaN = true, + has_denorm = 1, + has_denorm_loss = true, + round_style = 1, + is_iec559 = true, + is_bounded = true, + is_modulo = false, + digits = 24, + digits10 = 6, + max_digits10 = 9, + radix = 2, + min_exponent = -125, + min_exponent10 = -37, + max_exponent = 128, + max_exponent10 = 38, + tinyness_before = false, + traps = false + }; +}; +struct DoubleLimits { +#if __cplusplus >= 201103L + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + double lowest() noexcept { return -DBL_MAX; } + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + double min() noexcept { return DBL_MIN; } + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + double max() noexcept { return DBL_MAX; } +#endif // __cplusplus >= 201103L + enum { + is_specialized = true, + is_signed = true, + is_integer = false, + is_exact = false, + has_infinity = true, + has_quiet_NaN = true, + has_signaling_NaN = true, + has_denorm = 1, + has_denorm_loss = true, + round_style = 1, + is_iec559 = true, + is_bounded = true, + is_modulo = false, + digits = 53, + digits10 = 15, + max_digits10 = 17, + radix = 2, + min_exponent = -1021, + min_exponent10 = -307, + max_exponent = 1024, + max_exponent10 = 308, + tinyness_before = false, + traps = false + }; +}; +template +struct IntegerLimits { + static inline __host__ __device__ T min() { return Min; } + static inline __host__ __device__ T max() { return Max; } +#if __cplusplus >= 201103L + static constexpr inline __host__ __device__ T lowest() noexcept { + return Min; + } +#endif // __cplusplus >= 201103L + enum { + is_specialized = true, + digits = (Digits == -1) ? 
(int)(sizeof(T)*8 - (Min != 0)) : Digits, + digits10 = (digits * 30103) / 100000, + is_signed = ((T)(-1)<0), + is_integer = true, + is_exact = true, + radix = 2, + is_bounded = true, + is_modulo = false + }; +}; +} // namespace __jitify_detail +namespace std { using namespace __jitify_detail; } +namespace __jitify_limits_ns { +template struct numeric_limits { + enum { is_specialized = false }; +}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits {}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits {}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits {}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +//template struct numeric_limits { static const bool +//is_signed = ((T)(-1)<0); }; +template<> struct numeric_limits : public +__jitify_detail::FloatLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::DoubleLimits +{}; +} // namespace __jitify_limits_ns +namespace std { using namespace __jitify_limits_ns; } +using namespace __jitify_limits_ns; +)"; + +// TODO: This is highly incomplete +static const char* jitsafe_header_type_traits = R"( + #pragma once + #if __cplusplus >= 201103L + namespace __jitify_type_traits_ns { + + template struct enable_if {}; + template struct enable_if { typedef T type; }; + #if __cplusplus >= 201402L + template< bool B, class T = void > using enable_if_t = typename enable_if::type; + #endif + + struct true_type { + enum { value = true }; + operator bool() const { return true; } + }; + struct false_type { + enum { value = false }; + operator bool() const { return false; } + }; + + template struct is_floating_point : false_type {}; + template<> struct is_floating_point : true_type {}; + template<> struct is_floating_point : true_type {}; + template<> struct is_floating_point : true_type {}; + + template struct is_integral : false_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + + template struct is_signed : false_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + 
template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + + template struct is_unsigned : false_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + + template struct is_same : false_type {}; + template struct is_same : true_type {}; + + template struct is_array : false_type {}; + template struct is_array : true_type {}; + template struct is_array : true_type {}; + + //partial implementation only of is_function + template struct is_function : false_type { }; + template struct is_function : true_type {}; //regular + template struct is_function : true_type {}; // variadic + + template struct result_of; + template + struct result_of { + // TODO: This is a hack; a proper implem is quite complicated. + typedef typename F::result_type type; + }; + + template struct remove_reference { typedef T type; }; + template struct remove_reference { typedef T type; }; + template struct remove_reference { typedef T type; }; + #if __cplusplus >= 201402L + template< class T > using remove_reference_t = typename remove_reference::type; + #endif + + template struct remove_extent { typedef T type; }; + template struct remove_extent { typedef T type; }; + template struct remove_extent { typedef T type; }; + #if __cplusplus >= 201402L + template< class T > using remove_extent_t = typename remove_extent::type; + #endif + + template< class T > struct remove_const { typedef T type; }; + template< class T > struct remove_const { typedef T type; }; + template< class T > struct remove_volatile { typedef T type; }; + template< class T > struct remove_volatile { typedef T type; }; + template< class T > struct remove_cv { typedef typename remove_volatile::type>::type type; }; + #if __cplusplus >= 201402L + template< class T > using remove_cv_t = typename remove_cv::type; + template< class T > using remove_const_t = typename remove_const::type; + template< class T > using remove_volatile_t = typename remove_volatile::type; + #endif + + template struct conditional { typedef T type; }; + template struct conditional { typedef F type; }; + #if __cplusplus >= 201402L + template< bool B, class T, class F > using conditional_t = typename conditional::type; + #endif + + namespace __jitify_detail { + template< class T, bool is_function_type = false > struct add_pointer { using type = typename remove_reference::type*; }; + template< class T > struct add_pointer { using type = T; }; + template< class T, class... Args > struct add_pointer { using type = T(*)(Args...); }; + template< class T, class... 
Args > struct add_pointer { using type = T(*)(Args..., ...); }; + } + template< class T > struct add_pointer : __jitify_detail::add_pointer::value> {}; + #if __cplusplus >= 201402L + template< class T > using add_pointer_t = typename add_pointer::type; + #endif + + template< class T > struct decay { + private: + typedef typename remove_reference::type U; + public: + typedef typename conditional::value, typename remove_extent::type*, + typename conditional::value,typename add_pointer::type,typename remove_cv::type + >::type>::type type; + }; + #if __cplusplus >= 201402L + template< class T > using decay_t = typename decay::type; + #endif + + } // namespace __jtiify_type_traits_ns + namespace std { using namespace __jitify_type_traits_ns; } + using namespace __jitify_type_traits_ns; + #endif // c++11 +)"; + +// TODO: INT_FAST8_MAX et al. and a few other misc constants +static const char* jitsafe_header_stdint_h = + "#pragma once\n" + "#include \n" + "namespace __jitify_stdint_ns {\n" + "typedef signed char int8_t;\n" + "typedef signed short int16_t;\n" + "typedef signed int int32_t;\n" + "typedef signed long long int64_t;\n" + "typedef signed char int_fast8_t;\n" + "typedef signed short int_fast16_t;\n" + "typedef signed int int_fast32_t;\n" + "typedef signed long long int_fast64_t;\n" + "typedef signed char int_least8_t;\n" + "typedef signed short int_least16_t;\n" + "typedef signed int int_least32_t;\n" + "typedef signed long long int_least64_t;\n" + "typedef signed long long intmax_t;\n" + "typedef signed long intptr_t; //optional\n" + "typedef unsigned char uint8_t;\n" + "typedef unsigned short uint16_t;\n" + "typedef unsigned int uint32_t;\n" + "typedef unsigned long long uint64_t;\n" + "typedef unsigned char uint_fast8_t;\n" + "typedef unsigned short uint_fast16_t;\n" + "typedef unsigned int uint_fast32_t;\n" + "typedef unsigned long long uint_fast64_t;\n" + "typedef unsigned char uint_least8_t;\n" + "typedef unsigned short uint_least16_t;\n" + "typedef unsigned int uint_least32_t;\n" + "typedef unsigned long long uint_least64_t;\n" + "typedef unsigned long long uintmax_t;\n" + "typedef unsigned long uintptr_t; //optional\n" + "#define INT8_MIN SCHAR_MIN\n" + "#define INT16_MIN SHRT_MIN\n" + "#define INT32_MIN INT_MIN\n" + "#define INT64_MIN LLONG_MIN\n" + "#define INT8_MAX SCHAR_MAX\n" + "#define INT16_MAX SHRT_MAX\n" + "#define INT32_MAX INT_MAX\n" + "#define INT64_MAX LLONG_MAX\n" + "#define UINT8_MAX UCHAR_MAX\n" + "#define UINT16_MAX USHRT_MAX\n" + "#define UINT32_MAX UINT_MAX\n" + "#define UINT64_MAX ULLONG_MAX\n" + "#define INTPTR_MIN LONG_MIN\n" + "#define INTMAX_MIN LLONG_MIN\n" + "#define INTPTR_MAX LONG_MAX\n" + "#define INTMAX_MAX LLONG_MAX\n" + "#define UINTPTR_MAX ULONG_MAX\n" + "#define UINTMAX_MAX ULLONG_MAX\n" + "#define PTRDIFF_MIN INTPTR_MIN\n" + "#define PTRDIFF_MAX INTPTR_MAX\n" + "#define SIZE_MAX UINT64_MAX\n" + "} // namespace __jitify_stdint_ns\n" + "namespace std { using namespace __jitify_stdint_ns; }\n" + "using namespace __jitify_stdint_ns;\n"; + +// TODO: offsetof +static const char* jitsafe_header_stddef_h = + "#pragma once\n" + "#include \n" + "namespace __jitify_stddef_ns {\n" + "#if __cplusplus >= 201103L\n" + "typedef decltype(nullptr) nullptr_t;\n" + "#if defined(_MSC_VER)\n" + " typedef double max_align_t;\n" + "#elif defined(__APPLE__)\n" + " typedef long double max_align_t;\n" + "#else\n" + " // Define max_align_t to match the GCC definition.\n" + " typedef struct {\n" + " long long __jitify_max_align_nonce1\n" + " 
__attribute__((__aligned__(__alignof__(long long))));\n" + " long double __jitify_max_align_nonce2\n" + " __attribute__((__aligned__(__alignof__(long double))));\n" + " } max_align_t;\n" + "#endif\n" + "#endif // __cplusplus >= 201103L\n" + "#if __cplusplus >= 201703L\n" + "enum class byte : unsigned char {};\n" + "#endif // __cplusplus >= 201703L\n" + "} // namespace __jitify_stddef_ns\n" + "namespace std {\n" + " // NVRTC provides built-in definitions of ::size_t and ::ptrdiff_t.\n" + " using ::size_t;\n" + " using ::ptrdiff_t;\n" + " using namespace __jitify_stddef_ns;\n" + "} // namespace std\n" + "using namespace __jitify_stddef_ns;\n"; + +static const char* jitsafe_header_stdlib_h = + "#pragma once\n" + "#include \n"; +static const char* jitsafe_header_stdio_h = + "#pragma once\n" + "#include \n" + "#define FILE int\n" + "int fflush ( FILE * stream );\n" + "int fprintf ( FILE * stream, const char * format, ... );\n"; + +static const char* jitsafe_header_string_h = + "#pragma once\n" + "char* strcpy ( char * destination, const char * source );\n" + "int strcmp ( const char * str1, const char * str2 );\n" + "char* strerror( int errnum );\n"; + +static const char* jitsafe_header_cstring = + "#pragma once\n" + "\n" + "namespace __jitify_cstring_ns {\n" + "char* strcpy ( char * destination, const char * source );\n" + "int strcmp ( const char * str1, const char * str2 );\n" + "char* strerror( int errnum );\n" + "} // namespace __jitify_cstring_ns\n" + "namespace std { using namespace __jitify_cstring_ns; }\n" + "using namespace __jitify_cstring_ns;\n"; + +// HACK TESTING (WAR for cub) +static const char* jitsafe_header_iostream = + "#pragma once\n" + "#include \n" + "#include \n"; +// HACK TESTING (WAR for Thrust) +static const char* jitsafe_header_ostream = + "#pragma once\n" + "\n" + "namespace __jitify_ostream_ns {\n" + "template\n" // = std::char_traits + // >\n" + "struct basic_ostream {\n" + "};\n" + "typedef basic_ostream ostream;\n" + "ostream& endl(ostream& os);\n" + "ostream& operator<<( ostream&, ostream& (*f)( ostream& ) );\n" + "template< class CharT, class Traits > basic_ostream& endl( " + "basic_ostream& os );\n" + "template< class CharT, class Traits > basic_ostream& " + "operator<<( basic_ostream& os, const char* c );\n" + "#if __cplusplus >= 201103L\n" + "template< class CharT, class Traits, class T > basic_ostream& operator<<( basic_ostream&& os, const T& value );\n" + "#endif // __cplusplus >= 201103L\n" + "} // namespace __jitify_ostream_ns\n" + "namespace std { using namespace __jitify_ostream_ns; }\n" + "using namespace __jitify_ostream_ns;\n"; + +static const char* jitsafe_header_istream = + "#pragma once\n" + "\n" + "namespace __jitify_istream_ns {\n" + "template\n" // = std::char_traits + // >\n" + "struct basic_istream {\n" + "};\n" + "typedef basic_istream istream;\n" + "} // namespace __jitify_istream_ns\n" + "namespace std { using namespace __jitify_istream_ns; }\n" + "using namespace __jitify_istream_ns;\n"; + +static const char* jitsafe_header_sstream = + "#pragma once\n" + "#include \n" + "#include \n"; + +static const char* jitsafe_header_utility = + "#pragma once\n" + "namespace __jitify_utility_ns {\n" + "template\n" + "struct pair {\n" + " T1 first;\n" + " T2 second;\n" + " inline pair() {}\n" + " inline pair(T1 const& first_, T2 const& second_)\n" + " : first(first_), second(second_) {}\n" + " // TODO: Standard includes many more constructors...\n" + " // TODO: Comparison operators\n" + "};\n" + "template\n" + "pair make_pair(T1 const& first, T2 
const& second) {\n" + " return pair(first, second);\n" + "}\n" + "} // namespace __jitify_utility_ns\n" + "namespace std { using namespace __jitify_utility_ns; }\n" + "using namespace __jitify_utility_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_vector = + "#pragma once\n" + "namespace __jitify_vector_ns {\n" + "template\n" // = std::allocator> \n" + "struct vector {\n" + "};\n" + "} // namespace __jitify_vector_ns\n" + "namespace std { using namespace __jitify_vector_ns; }\n" + "using namespace __jitify_vector_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_string = + "#pragma once\n" + "namespace __jitify_string_ns {\n" + "template\n" + "struct basic_string {\n" + "basic_string();\n" + "basic_string( const CharT* s );\n" //, const Allocator& alloc = + // Allocator() );\n" + "const CharT* c_str() const;\n" + "bool empty() const;\n" + "void operator+=(const char *);\n" + "void operator+=(const basic_string &);\n" + "};\n" + "typedef basic_string string;\n" + "} // namespace __jitify_string_ns\n" + "namespace std { using namespace __jitify_string_ns; }\n" + "using namespace __jitify_string_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_stdexcept = + "#pragma once\n" + "namespace __jitify_stdexcept_ns {\n" + "struct runtime_error {\n" + "explicit runtime_error( const std::string& what_arg );" + "explicit runtime_error( const char* what_arg );" + "virtual const char* what() const;\n" + "};\n" + "} // namespace __jitify_stdexcept_ns\n" + "namespace std { using namespace __jitify_stdexcept_ns; }\n" + "using namespace __jitify_stdexcept_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_complex = + "#pragma once\n" + "namespace __jitify_complex_ns {\n" + "template\n" + "class complex {\n" + " T _real;\n" + " T _imag;\n" + "public:\n" + " complex() : _real(0), _imag(0) {}\n" + " complex(T const& real, T const& imag)\n" + " : _real(real), _imag(imag) {}\n" + " complex(T const& real)\n" + " : _real(real), _imag(static_cast(0)) {}\n" + " T const& real() const { return _real; }\n" + " T& real() { return _real; }\n" + " void real(const T &r) { _real = r; }\n" + " T const& imag() const { return _imag; }\n" + " T& imag() { return _imag; }\n" + " void imag(const T &i) { _imag = i; }\n" + " complex& operator+=(const complex z)\n" + " { _real += z.real(); _imag += z.imag(); return *this; }\n" + "};\n" + "template\n" + "complex operator*(const complex& lhs, const complex& rhs)\n" + " { return complex(lhs.real()*rhs.real()-lhs.imag()*rhs.imag(),\n" + " lhs.real()*rhs.imag()+lhs.imag()*rhs.real()); }\n" + "template\n" + "complex operator*(const complex& lhs, const T & rhs)\n" + " { return complexs(lhs.real()*rhs,lhs.imag()*rhs); }\n" + "template\n" + "complex operator*(const T& lhs, const complex& rhs)\n" + " { return complexs(rhs.real()*lhs,rhs.imag()*lhs); }\n" + "} // namespace __jitify_complex_ns\n" + "namespace std { using namespace __jitify_complex_ns; }\n" + "using namespace __jitify_complex_ns;\n"; + +// TODO: This is incomplete (missing binary and integer funcs, macros, +// constants, types) +static const char* jitsafe_header_math = + "#pragma once\n" + "namespace __jitify_math_ns {\n" + "#if __cplusplus >= 201103L\n" + "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" + " inline double f(double x) { return ::f(x); } \\\n" + " inline float f##f(float x) { return ::f(x); } \\\n" + " /*inline long double f##l(long double x) { return ::f(x); }*/ \\\n" + " inline float f(float x) { return ::f(x); } \\\n" + " /*inline long double 
f(long double x) { return ::f(x); }*/\n" + "#else\n" + "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" + " inline double f(double x) { return ::f(x); } \\\n" + " inline float f##f(float x) { return ::f(x); } \\\n" + " /*inline long double f##l(long double x) { return ::f(x); }*/\n" + "#endif\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(cos)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(sin)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(tan)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(acos)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(asin)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(atan)\n" + "template inline T atan2(T y, T x) { return ::atan2(y, x); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(cosh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(sinh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(tanh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp)\n" + "template inline T frexp(T x, int* exp) { return ::frexp(x, " + "exp); }\n" + "template inline T ldexp(T x, int exp) { return ::ldexp(x, " + "exp); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log10)\n" + "template inline T modf(T x, T* intpart) { return ::modf(x, " + "intpart); }\n" + "template inline T pow(T x, T y) { return ::pow(x, y); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(sqrt)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(ceil)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(floor)\n" + "template inline T fmod(T n, T d) { return ::fmod(n, d); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(fabs)\n" + "template inline T abs(T x) { return ::abs(x); }\n" + "#if __cplusplus >= 201103L\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(acosh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(asinh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(atanh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp2)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(expm1)\n" + "template inline int ilogb(T x) { return ::ilogb(x); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log1p)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log2)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(logb)\n" + "template inline T scalbn (T x, int n) { return ::scalbn(x, " + "n); }\n" + "template inline T scalbln(T x, long n) { return ::scalbn(x, " + "n); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(cbrt)\n" + "template inline T hypot(T x, T y) { return ::hypot(x, y); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(erf)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(erfc)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(tgamma)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(lgamma)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(trunc)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(round)\n" + "template inline long lround(T x) { return ::lround(x); }\n" + "template inline long long llround(T x) { return ::llround(x); " + "}\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(rint)\n" + "template inline long lrint(T x) { return ::lrint(x); }\n" + "template inline long long llrint(T x) { return ::llrint(x); " + "}\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(nearbyint)\n" + // TODO: remainder, remquo, copysign, nan, nextafter, nexttoward, fdim, + // fmax, fmin, fma + "#endif\n" + "#undef DEFINE_MATH_UNARY_FUNC_WRAPPER\n" + "} // namespace __jitify_math_ns\n" + "namespace std { using namespace __jitify_math_ns; }\n" + "#define M_PI 3.14159265358979323846\n" + // Note: Global namespace already includes CUDA math funcs + "//using namespace __jitify_math_ns;\n"; + +static const char* jitsafe_header_memory_h = R"( + #pragma once + #include + )"; + +// TODO: incomplete +static const char* jitsafe_header_mutex = R"( + #pragma once + #if __cplusplus >= 201103L + namespace __jitify_mutex_ns { + class mutex { + public: + void lock(); + bool try_lock(); + void unlock(); + }; + } // namespace __jitify_mutex_ns + namespace std { 
using namespace __jitify_mutex_ns; } + using namespace __jitify_mutex_ns; + #endif + )"; + +static const char* jitsafe_header_algorithm = R"( + #pragma once + #if __cplusplus >= 201103L + namespace __jitify_algorithm_ns { + + #if __cplusplus == 201103L + #define JITIFY_CXX14_CONSTEXPR + #else + #define JITIFY_CXX14_CONSTEXPR constexpr + #endif + + template JITIFY_CXX14_CONSTEXPR const T& max(const T& a, const T& b) + { + return (b > a) ? b : a; + } + template JITIFY_CXX14_CONSTEXPR const T& min(const T& a, const T& b) + { + return (b < a) ? b : a; + } + + } // namespace __jitify_algorithm_ns + namespace std { using namespace __jitify_algorithm_ns; } + using namespace __jitify_algorithm_ns; + #endif + )"; + +static const char* jitsafe_header_time_h = R"( + #pragma once + #define NULL 0 + #define CLOCKS_PER_SEC 1000000 + namespace __jitify_time_ns { + typedef long time_t; + struct tm { + int tm_sec; + int tm_min; + int tm_hour; + int tm_mday; + int tm_mon; + int tm_year; + int tm_wday; + int tm_yday; + int tm_isdst; + }; + #if __cplusplus >= 201703L + struct timespec { + time_t tv_sec; + long tv_nsec; + }; + #endif + } // namespace __jitify_time_ns + namespace std { + // NVRTC provides built-in definitions of ::size_t and ::clock_t. + using ::size_t; + using ::clock_t; + using namespace __jitify_time_ns; + } + using namespace __jitify_time_ns; + )"; + +// WAR: These need to be pre-included as a workaround for NVRTC implicitly using +// /usr/include as an include path. The other built-in headers will be included +// lazily as needed. +static const char* preinclude_jitsafe_header_names[] = { + "jitify_preinclude.h", + "limits.h", + "math.h", + "memory.h", + "stdint.h", + "stdlib.h", + "stdio.h", + "string.h", + "time.h", +}; + +template +int array_size(T (&)[N]) { + return N; +} +const int preinclude_jitsafe_headers_count = + array_size(preinclude_jitsafe_header_names); + +static const std::map& get_jitsafe_headers_map() { + static const std::map jitsafe_headers_map = { + {"jitify_preinclude.h", jitsafe_header_preinclude_h}, + {"float.h", jitsafe_header_float_h}, + {"cfloat", jitsafe_header_float_h}, + {"limits.h", jitsafe_header_limits_h}, + {"climits", jitsafe_header_limits_h}, + {"stdint.h", jitsafe_header_stdint_h}, + {"cstdint", jitsafe_header_stdint_h}, + {"stddef.h", jitsafe_header_stddef_h}, + {"cstddef", jitsafe_header_stddef_h}, + {"stdlib.h", jitsafe_header_stdlib_h}, + {"cstdlib", jitsafe_header_stdlib_h}, + {"stdio.h", jitsafe_header_stdio_h}, + {"cstdio", jitsafe_header_stdio_h}, + {"string.h", jitsafe_header_string_h}, + {"cstring", jitsafe_header_cstring}, + {"iterator", jitsafe_header_iterator}, + {"limits", jitsafe_header_limits}, + {"type_traits", jitsafe_header_type_traits}, + {"utility", jitsafe_header_utility}, + {"math.h", jitsafe_header_math}, + {"cmath", jitsafe_header_math}, + {"memory.h", jitsafe_header_memory_h}, + {"complex", jitsafe_header_complex}, + {"iostream", jitsafe_header_iostream}, + {"ostream", jitsafe_header_ostream}, + {"istream", jitsafe_header_istream}, + {"sstream", jitsafe_header_sstream}, + {"vector", jitsafe_header_vector}, + {"string", jitsafe_header_string}, + {"stdexcept", jitsafe_header_stdexcept}, + {"mutex", jitsafe_header_mutex}, + {"algorithm", jitsafe_header_algorithm}, + {"time.h", jitsafe_header_time_h}, + {"ctime", jitsafe_header_time_h}, + }; + return jitsafe_headers_map; +} + +inline void add_options_from_env(std::vector& options) { + // Add options from environment variable + const char* env_options = std::getenv("JITIFY_OPTIONS"); 
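+  // (Illustrative) e.g. JITIFY_OPTIONS="-I/opt/include -DNDEBUG" appends the
+  // two whitespace-separated tokens to `options` below.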
+ if (env_options) { + std::stringstream ss; + ss << env_options; + std::string opt; + while (!(ss >> opt).fail()) { + options.push_back(opt); + } + } + // Add options from JITIFY_OPTIONS macro +#ifdef JITIFY_OPTIONS +#define JITIFY_TOSTRING_IMPL(x) #x +#define JITIFY_TOSTRING(x) JITIFY_TOSTRING_IMPL(x) + std::stringstream ss; + ss << JITIFY_TOSTRING(JITIFY_OPTIONS); + std::string opt; + while (!(ss >> opt).fail()) { + options.push_back(opt); + } +#undef JITIFY_TOSTRING +#undef JITIFY_TOSTRING_IMPL +#endif // JITIFY_OPTIONS +} + +inline void detect_and_add_cuda_arch(std::vector& options) { + for (int i = 0; i < (int)options.size(); ++i) { + // Note that this will also match the middle of "--gpu-architecture". + if (options[i].find("-arch") != std::string::npos) { + // Arch already specified in options + return; + } + } + // Use the compute capability of the current device + // TODO: Check these API calls for errors + cudaError_t status; + int device; + status = cudaGetDevice(&device); + if (status != cudaSuccess) { + throw std::runtime_error( + std::string( + "Failed to detect GPU architecture: cudaGetDevice failed: ") + + cudaGetErrorString(status)); + } + int cc_major; + cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device); + int cc_minor; + cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device); + int cc = cc_major * 10 + cc_minor; + // Note: We must limit the architecture to the max supported by the current + // version of NVRTC, otherwise newer hardware will cause errors + // on older versions of CUDA. + // TODO: It would be better to detect this somehow, rather than hard-coding it + + // Tegra chips do not have forwards compatibility so we need to special case + // them. + bool is_tegra = ((cc_major == 3 && cc_minor == 2) || // Logan + (cc_major == 5 && cc_minor == 3) || // Erista + (cc_major == 6 && cc_minor == 2) || // Parker + (cc_major == 7 && cc_minor == 2)); // Xavier + if (!is_tegra) { + // ensure that future CUDA versions just work (even if suboptimal) + const int cuda_major = std::min(10, CUDA_VERSION / 1000); + // clang-format off + switch (cuda_major) { + case 10: cc = std::min(cc, 75); break; // Turing + case 9: cc = std::min(cc, 70); break; // Volta + case 8: cc = std::min(cc, 61); break; // Pascal + case 7: cc = std::min(cc, 52); break; // Maxwell + default: + throw std::runtime_error("Unexpected CUDA major version " + + std::to_string(cuda_major)); + } + // clang-format on + } + + std::stringstream ss; + ss << cc; + options.push_back("-arch=compute_" + ss.str()); +} + +inline void detect_and_add_cxx11_flag(std::vector& options) { + // Reverse loop so we can erase on the fly. + for (int i = (int)options.size() - 1; i >= 0; --i) { + if (options[i].find("-std=c++98") != std::string::npos) { + // NVRTC doesn't support specifying c++98 explicitly, so we remove it. + options.erase(options.begin() + i); + return; + } else if (options[i].find("-std") != std::string::npos) { + // Some other standard was explicitly specified, don't change anything. + return; + } + } + // Jitify must be compiled with C++11 support, so we default to enabling it + // for the JIT-compiled code too. 
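+  // (Illustrative) e.g. {"-O3"}        -> {"-O3", "-std=c++11"}
+  //                     {"-std=c++14"} -> unchanged
+  //                     {"-std=c++98"} -> the flag is removed in the loop
+  //                                       above and nothing is added (NVRTC
+  //                                       has no c++98 mode).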
+ options.push_back("-std=c++11"); +} + +inline void split_compiler_and_linker_options( + std::vector options, + std::vector* compiler_options, + std::vector* linker_files, + std::vector* linker_paths) { + for (int i = 0; i < (int)options.size(); ++i) { + std::string opt = options[i]; + std::string flag = opt.substr(0, 2); + std::string value = opt.substr(2); + if (flag == "-l") { + linker_files->push_back(value); + } else if (flag == "-L") { + linker_paths->push_back(value); + } else { + compiler_options->push_back(opt); + } + } +} + +inline bool pop_remove_unused_globals_flag(std::vector* options) { + auto it = std::remove_if( + options->begin(), options->end(), [](const std::string& opt) { + return opt.find("-remove-unused-globals") != std::string::npos; + }); + if (it != options->end()) { + options->resize(it - options->begin()); + return true; + } + return false; +} + +inline std::string ptx_parse_decl_name(const std::string& line) { + size_t name_end = line.find_first_of("[;"); + if (name_end == std::string::npos) { + throw std::runtime_error( + "Failed to parse .global/.const declaration in PTX: expected a " + "semicolon"); + } + size_t name_start_minus1 = line.find_last_of(" \t", name_end); + if (name_start_minus1 == std::string::npos) { + throw std::runtime_error( + "Failed to parse .global/.const declaration in PTX: expected " + "whitespace"); + } + size_t name_start = name_start_minus1 + 1; + std::string name = line.substr(name_start, name_end - name_start); + return name; +} + +inline void ptx_remove_unused_globals(std::string* ptx) { + std::istringstream iss(*ptx); + std::vector lines; + std::unordered_map line_num_to_global_name; + std::unordered_set name_set; + for (std::string line; std::getline(iss, line);) { + size_t line_num = lines.size(); + lines.push_back(line); + auto terms = split_string(line); + if (terms.size() <= 1) continue; // Ignore lines with no arguments + if (terms[0].substr(0, 2) == "//") continue; // Ignore comment lines + if (terms[0].substr(0, 7) == ".global" || + terms[0].substr(0, 6) == ".const") { + line_num_to_global_name.emplace(line_num, ptx_parse_decl_name(line)); + continue; + } + if (terms[0][0] == '.') continue; // Ignore .version, .reg, .param etc. + // Note: The first term will always be an instruction name; starting at 1 + // also allows unchecked inspection of the previous term. + for (int i = 1; i < (int)terms.size(); ++i) { + if (terms[i].substr(0, 2) == "//") break; // Ignore comments + // Note: The characters '.' and '%' are not treated as delimiters. + const char* token_delims = " \t()[]{},;+-*/~&|^?:=!<>\"'\\"; + for (auto token : split_string(terms[i], -1, token_delims)) { + if ( // Ignore non-names + !(std::isalpha(token[0]) || token[0] == '_' || token[0] == '$') || + token.find('.') != std::string::npos || + // Ignore variable/parameter declarations + terms[i - 1][0] == '.' || + // Ignore branch instructions + (token == "bra" && terms[i - 1][0] == '@') || + // Ignore branch labels + (token.substr(0, 2) == "BB" && + terms[i - 1].substr(0, 3) == "bra")) { + continue; + } + name_set.insert(token); + } + } + } + std::ostringstream oss; + for (size_t line_num = 0; line_num < lines.size(); ++line_num) { + auto it = line_num_to_global_name.find(line_num); + if (it != line_num_to_global_name.end()) { + const std::string& name = it->second; + if (!name_set.count(name)) { + continue; // Remove unused .global declaration. 
+ } + } + oss << lines[line_num] << '\n'; + } + *ptx = oss.str(); +} + +inline nvrtcResult compile_kernel(std::string program_name, + std::map sources, + std::vector options, + std::string instantiation = "", + std::string* log = 0, std::string* ptx = 0, + std::string* mangled_instantiation = 0) { + std::string program_source = sources[program_name]; + // Build arrays of header names and sources + std::vector header_names_c; + std::vector header_sources_c; + int num_headers = (int)(sources.size() - 1); + header_names_c.reserve(num_headers); + header_sources_c.reserve(num_headers); + typedef std::map source_map; + for (source_map::const_iterator iter = sources.begin(); iter != sources.end(); + ++iter) { + std::string const& name = iter->first; + std::string const& code = iter->second; + if (name == program_name) { + continue; + } + header_names_c.push_back(name.c_str()); + header_sources_c.push_back(code.c_str()); + } + + // TODO: This WAR is expected to be unnecessary as of CUDA > 10.2. + bool should_remove_unused_globals = + detail::pop_remove_unused_globals_flag(&options); + + std::vector options_c(options.size() + 2); + options_c[0] = "--device-as-default-execution-space"; + options_c[1] = "--pre-include=jitify_preinclude.h"; + for (int i = 0; i < (int)options.size(); ++i) { + options_c[i + 2] = options[i].c_str(); + } + +#if CUDA_VERSION < 8000 + std::string inst_dummy; + if (!instantiation.empty()) { + // WAR for no nvrtcAddNameExpression before CUDA 8.0 + // Force template instantiation by adding dummy reference to kernel + inst_dummy = "__jitify_instantiation"; + program_source += + "\nvoid* " + inst_dummy + " = (void*)" + instantiation + ";\n"; + } +#endif + +#define CHECK_NVRTC(call) \ + do { \ + nvrtcResult ret = call; \ + if (ret != NVRTC_SUCCESS) { \ + return ret; \ + } \ + } while (0) + + nvrtcProgram nvrtc_program; + CHECK_NVRTC(nvrtcCreateProgram( + &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers, + header_sources_c.data(), header_names_c.data())); + +#if CUDA_VERSION >= 8000 + if (!instantiation.empty()) { + CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str())); + } +#endif + + nvrtcResult ret = nvrtcCompileProgram(nvrtc_program, (int)options_c.size(), + options_c.data()); + if (log) { + size_t logsize; + CHECK_NVRTC(nvrtcGetProgramLogSize(nvrtc_program, &logsize)); + std::vector vlog(logsize, 0); + CHECK_NVRTC(nvrtcGetProgramLog(nvrtc_program, vlog.data())); + log->assign(vlog.data(), logsize); + } + if (ret != NVRTC_SUCCESS) { + return ret; + } + + if (ptx) { + size_t ptxsize; + CHECK_NVRTC(nvrtcGetPTXSize(nvrtc_program, &ptxsize)); + std::vector vptx(ptxsize); + CHECK_NVRTC(nvrtcGetPTX(nvrtc_program, vptx.data())); + ptx->assign(vptx.data(), ptxsize); + if (should_remove_unused_globals) { + detail::ptx_remove_unused_globals(ptx); + } + } + + if (!instantiation.empty() && mangled_instantiation) { +#if CUDA_VERSION >= 8000 + const char* mangled_instantiation_cstr; + // Note: The returned string pointer becomes invalid after + // nvrtcDestroyProgram has been called, so we save it. 
+ CHECK_NVRTC(nvrtcGetLoweredName(nvrtc_program, instantiation.c_str(), + &mangled_instantiation_cstr)); + *mangled_instantiation = mangled_instantiation_cstr; +#else + // Extract mangled kernel template instantiation from PTX + inst_dummy += " = "; // Note: This must match how the PTX is generated + int mi_beg = ptx->find(inst_dummy) + inst_dummy.size(); + int mi_end = ptx->find(";", mi_beg); + *mangled_instantiation = ptx->substr(mi_beg, mi_end - mi_beg); +#endif + } + + CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program)); +#undef CHECK_NVRTC + return NVRTC_SUCCESS; +} + +inline void load_program(std::string const& cuda_source, + std::vector const& headers, + file_callback_type file_callback, + std::vector* include_paths, + std::map* program_sources, + std::vector* program_options, + std::string* program_name) { + // Extract include paths from compile options + std::vector::iterator iter = program_options->begin(); + while (iter != program_options->end()) { + std::string const& opt = *iter; + if (opt.substr(0, 2) == "-I") { + include_paths->push_back(opt.substr(2)); + iter = program_options->erase(iter); + } else { + ++iter; + } + } + + // Load program source + if (!detail::load_source(cuda_source, *program_sources, "", *include_paths, + file_callback)) { + throw std::runtime_error("Source not found: " + cuda_source); + } + *program_name = program_sources->begin()->first; + + // Maps header include names to their full file paths. + std::map header_fullpaths; + + // Load header sources + for (std::string const& header : headers) { + if (!detail::load_source(header, *program_sources, "", *include_paths, + file_callback, &header_fullpaths)) { + // **TODO: Deal with source not found + throw std::runtime_error("Source not found: " + header); + } + } + +#if JITIFY_PRINT_SOURCE + std::string& program_source = (*program_sources)[*program_name]; + std::cout << "---------------------------------------" << std::endl; + std::cout << "--- Source of " << *program_name << " ---" << std::endl; + std::cout << "---------------------------------------" << std::endl; + detail::print_with_line_numbers(program_source); + std::cout << "---------------------------------------" << std::endl; +#endif + + std::vector compiler_options, linker_files, linker_paths; + detail::split_compiler_and_linker_options(*program_options, &compiler_options, + &linker_files, &linker_paths); + + // If no arch is specified at this point we use whatever the current + // context is. This ensures we pick up the correct internal headers + // for arch-dependent compilation, e.g., some intrinsics are only + // present for specific architectures. + detail::detect_and_add_cuda_arch(compiler_options); + detail::detect_and_add_cxx11_flag(compiler_options); + + // Iteratively try to compile the sources, and use the resulting errors to + // identify missing headers. + std::string log; + nvrtcResult ret; + while ((ret = detail::compile_kernel(*program_name, *program_sources, + compiler_options, "", &log)) == + NVRTC_ERROR_COMPILATION) { + std::string include_name; + std::string include_parent; + int line_num = 0; + if (!detail::extract_include_info_from_compile_error( + log, include_name, include_parent, line_num)) { +#if JITIFY_PRINT_LOG + detail::print_compile_log(*program_name, log); +#endif + // There was a non include-related compilation error + // TODO: How to handle error? 
+ throw std::runtime_error("Runtime compilation failed"); + } + + bool is_included_with_quotes = false; + if (program_sources->count(include_parent)) { + const std::string& parent_source = (*program_sources)[include_parent]; + is_included_with_quotes = + is_include_directive_with_quotes(parent_source, line_num); + } + + // Try to load the new header + // Note: This fullpath lookup is needed because the compiler error + // messages have the include name of the header instead of its full path. + std::string include_parent_fullpath = header_fullpaths[include_parent]; + std::string include_path = detail::path_base(include_parent_fullpath); + if (detail::load_source(include_name, *program_sources, include_path, + *include_paths, file_callback, &header_fullpaths, + is_included_with_quotes)) { +#if JITIFY_PRINT_HEADER_PATHS + std::cout << "Found #include " << include_name << " from " + << include_parent << ":" << line_num << " [" + << include_parent_fullpath << "]" + << " at:\n " << header_fullpaths[include_name] << std::endl; +#endif + } else { // Failed to find header file. + // Comment-out the include line and print a warning + if (!program_sources->count(include_parent)) { + // ***TODO: Unless there's another mechanism (e.g., potentially + // the parent path vs. filename problem), getting + // here means include_parent was found automatically + // in a system include path. + // We need a WAR to zap it from *its parent*. + + typedef std::map source_map; + for (source_map::const_iterator it = program_sources->begin(); + it != program_sources->end(); ++it) { + std::cout << " " << it->first << std::endl; + } + throw std::out_of_range(include_parent + + " not in loaded sources!" + " This may be due to a header being loaded by" + " NVRTC without Jitify's knowledge."); + } + std::string& parent_source = (*program_sources)[include_parent]; + parent_source = detail::comment_out_code_line(line_num, parent_source); +#if JITIFY_PRINT_LOG + std::cout << include_parent << "(" << line_num + << "): warning: " << include_name << ": [jitify] File not found" + << std::endl; +#endif + } + } + if (ret != NVRTC_SUCCESS) { +#if JITIFY_PRINT_LOG + if (ret == NVRTC_ERROR_INVALID_OPTION) { + std::cout << "Compiler options: "; + for (int i = 0; i < (int)compiler_options.size(); ++i) { + std::cout << compiler_options[i] << " "; + } + std::cout << std::endl; + } +#endif + throw std::runtime_error(std::string("NVRTC error: ") + + nvrtcGetErrorString(ret)); + } +} + +inline void instantiate_kernel( + std::string const& program_name, + std::map const& program_sources, + std::string const& instantiation, std::vector const& options, + std::string* log, std::string* ptx, std::string* mangled_instantiation, + std::vector* linker_files, + std::vector* linker_paths) { + std::vector compiler_options; + detail::split_compiler_and_linker_options(options, &compiler_options, + linker_files, linker_paths); + + nvrtcResult ret = + detail::compile_kernel(program_name, program_sources, compiler_options, + instantiation, log, ptx, mangled_instantiation); +#if JITIFY_PRINT_LOG + if (log->size() > 1) { + detail::print_compile_log(program_name, *log); + } +#endif + if (ret != NVRTC_SUCCESS) { + throw std::runtime_error(std::string("NVRTC error: ") + + nvrtcGetErrorString(ret)); + } + +#if JITIFY_PRINT_PTX + std::cout << "---------------------------------------" << std::endl; + std::cout << *mangled_instantiation << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << "--- PTX for " << 
mangled_instantiation << " in " << program_name + << " ---" << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << *ptx << std::endl; + std::cout << "---------------------------------------" << std::endl; +#endif +} + +inline void get_1d_max_occupancy(CUfunction func, + CUoccupancyB2DSize smem_callback, + unsigned int* smem, int max_block_size, + unsigned int flags, int* grid, int* block) { + if (!func) { + throw std::runtime_error( + "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE " + "1"); + } + CUresult res = cuOccupancyMaxPotentialBlockSizeWithFlags( + grid, block, func, smem_callback, *smem, max_block_size, flags); + if (res != CUDA_SUCCESS) { + const char* msg; + cuGetErrorName(res, &msg); + throw std::runtime_error(msg); + } + if (smem_callback) { + *smem = (unsigned int)smem_callback(*block); + } +} + +} // namespace detail + +//! \endcond + +class KernelInstantiation; +class Kernel; +class Program; +class JitCache; + +struct ProgramConfig { + std::vector options; + std::vector include_paths; + std::string name; + typedef std::map source_map; + source_map sources; +}; + +class JitCache_impl { + friend class Program_impl; + friend class KernelInstantiation_impl; + friend class KernelLauncher_impl; + typedef uint64_t key_type; + jitify::ObjectCache _kernel_cache; + jitify::ObjectCache _program_config_cache; + std::vector _options; +#if JITIFY_THREAD_SAFE + std::mutex _kernel_cache_mutex; + std::mutex _program_cache_mutex; +#endif + public: + inline JitCache_impl(size_t cache_size) + : _kernel_cache(cache_size), _program_config_cache(cache_size) { + detail::add_options_from_env(_options); + + // Bootstrap the cuda context to avoid errors + cudaFree(0); + } +}; + +class Program_impl { + // A friendly class + friend class Kernel_impl; + friend class KernelLauncher_impl; + friend class KernelInstantiation_impl; + // TODO: This can become invalid if JitCache is destroyed before the + // Program object is. However, this can't happen if JitCache + // instances are static. 
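+ // In other words, the JitCache that created this Program must outlive it;
+ // a static (or thread_local static, as parallel_for later in this header
+ // uses) JitCache is the pattern this note assumes.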
+ JitCache_impl& _cache; + uint64_t _hash; + ProgramConfig* _config; + void load_sources(std::string source, std::vector headers, + std::vector options, + file_callback_type file_callback); + + public: + inline Program_impl(JitCache_impl& cache, std::string source, + jitify::detail::vector headers = 0, + jitify::detail::vector options = 0, + file_callback_type file_callback = 0); + inline Program_impl(Program_impl const&) = default; + inline Program_impl(Program_impl&&) = default; + inline std::vector const& options() const { + return _config->options; + } + inline std::string const& name() const { return _config->name; } + inline ProgramConfig::source_map const& sources() const { + return _config->sources; + } + inline std::vector const& include_paths() const { + return _config->include_paths; + } +}; + +class Kernel_impl { + friend class KernelLauncher_impl; + friend class KernelInstantiation_impl; + Program_impl _program; + std::string _name; + std::vector _options; + uint64_t _hash; + + public: + inline Kernel_impl(Program_impl const& program, std::string name, + jitify::detail::vector options = 0); + inline Kernel_impl(Kernel_impl const&) = default; + inline Kernel_impl(Kernel_impl&&) = default; +}; + +class KernelInstantiation_impl { + friend class KernelLauncher_impl; + Kernel_impl _kernel; + uint64_t _hash; + std::string _template_inst; + std::vector _options; + detail::CUDAKernel* _cuda_kernel; + inline void print() const; + void build_kernel(); + + public: + inline KernelInstantiation_impl( + Kernel_impl const& kernel, std::vector const& template_args); + inline KernelInstantiation_impl(KernelInstantiation_impl const&) = default; + inline KernelInstantiation_impl(KernelInstantiation_impl&&) = default; + detail::CUDAKernel const& cuda_kernel() const { return *_cuda_kernel; } +}; + +class KernelLauncher_impl { + KernelInstantiation_impl _kernel_inst; + dim3 _grid; + dim3 _block; + unsigned int _smem; + cudaStream_t _stream; + + public: + inline KernelLauncher_impl(KernelInstantiation_impl const& kernel_inst, + dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) + : _kernel_inst(kernel_inst), + _grid(grid), + _block(block), + _smem(smem), + _stream(stream) {} + inline KernelLauncher_impl(KernelLauncher_impl const&) = default; + inline KernelLauncher_impl(KernelLauncher_impl&&) = default; + inline CUresult launch( + jitify::detail::vector arg_ptrs, + jitify::detail::vector arg_types = 0) const; +}; + +/*! An object representing a configured and instantiated kernel ready + * for launching. + */ +class KernelLauncher { + std::unique_ptr _impl; + + public: + inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid, + dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0); + + // Note: It's important that there is no implicit conversion required + // for arg_ptrs, because otherwise the parameter pack version + // below gets called instead (probably resulting in a segfault). + /*! Launch the kernel. + * + * \param arg_ptrs A vector of pointers to each function argument for the + * kernel. + * \param arg_types A vector of function argument types represented + * as code-strings. This parameter is optional and is only used to print + * out the function signature. + */ + inline CUresult launch( + std::vector arg_ptrs = std::vector(), + jitify::detail::vector arg_types = 0) const { + return _impl->launch(arg_ptrs, arg_types); + } + // Regular function call syntax + /*! Launch the kernel. 
+ * + * \see launch + */ + template + inline CUresult operator()(ArgTypes... args) const { + return this->launch(args...); + } + /*! Launch the kernel. + * + * \param args Function arguments for the kernel. + */ + template + inline CUresult launch(ArgTypes... args) const { + return this->launch(std::vector({(void*)&args...}), + {reflection::reflect()...}); + } +}; + +/*! An object representing a kernel instantiation made up of a Kernel and + * template arguments. + */ +class KernelInstantiation { + friend class KernelLauncher; + std::unique_ptr _impl; + + public: + inline KernelInstantiation(Kernel const& kernel, + std::vector const& template_args); + + /*! Implicit conversion to the underlying CUfunction object. + * + * \note This allows use of CUDA APIs like + * cuOccupancyMaxActiveBlocksPerMultiprocessor. + */ + inline operator CUfunction() const { return _impl->cuda_kernel(); } + + /*! Configure the kernel launch. + * + * \see configure + */ + inline KernelLauncher operator()(dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) const { + return this->configure(grid, block, smem, stream); + } + /*! Configure the kernel launch. + * + * \param grid The thread grid dimensions for the launch. + * \param block The thread block dimensions for the launch. + * \param smem The amount of shared memory to dynamically allocate, in + * bytes. + * \param stream The CUDA stream to launch the kernel in. + */ + inline KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) const { + return KernelLauncher(*this, grid, block, smem, stream); + } + /*! Configure the kernel launch with a 1-dimensional block and grid chosen + * automatically to maximise occupancy. + * + * \param max_block_size The upper limit on the block size, or 0 for no + * limit. + * \param smem The amount of shared memory to dynamically allocate, in bytes. + * \param smem_callback A function returning smem for a given block size (overrides \p smem). + * \param stream The CUDA stream to launch the kernel in. + * \param flags The flags to pass to cuOccupancyMaxPotentialBlockSizeWithFlags. + */ + inline KernelLauncher configure_1d_max_occupancy( + int max_block_size = 0, unsigned int smem = 0, + CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0, + unsigned int flags = 0) const { + int grid; + int block; + CUfunction func = _impl->cuda_kernel(); + detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size, + flags, &grid, &block); + return this->configure(grid, block, smem, stream); + } + + /* + * \deprecated Use \p get_global_ptr instead. + */ + inline CUdeviceptr get_constant_ptr(const char* name, + size_t* size = nullptr) const { + return get_global_ptr(name, size); + } + + /* + * Get a device pointer to a global __constant__ or __device__ variable using + * its un-mangled name. If provided, *size is set to the size of the variable + * in bytes. + */ + inline CUdeviceptr get_global_ptr(const char* name, + size_t* size = nullptr) const { + return _impl->cuda_kernel().get_global_ptr(name, size); + } + + /* + * Copy data from a global __constant__ or __device__ array to the host using + * its un-mangled name. + */ + template + inline CUresult get_global_array(const char* name, T* data, size_t count, + CUstream stream = 0) const { + return _impl->cuda_kernel().get_global_data(name, data, count, stream); + } + + /* + * Copy a value from a global __constant__ or __device__ variable to the host + * using its un-mangled name. 
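+ *
+ * A minimal sketch, assuming the JIT-compiled source declares a variable
+ * such as "__device__ float tolerance;" (the name is hypothetical) and
+ * kernel_inst is this KernelInstantiation:
+ *
+ *   float tol = 0.0f;
+ *   kernel_inst.get_global_value("tolerance", &tol);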
+ */ + template + inline CUresult get_global_value(const char* name, T* value, + CUstream stream = 0) const { + return get_global_array(name, value, 1, stream); + } + + /* + * Copy data from the host to a global __constant__ or __device__ array using + * its un-mangled name. + */ + template + inline CUresult set_global_array(const char* name, const T* data, + size_t count, CUstream stream = 0) const { + return _impl->cuda_kernel().set_global_data(name, data, count, stream); + } + + /* + * Copy a value from the host to a global __constant__ or __device__ variable + * using its un-mangled name. + */ + template + inline CUresult set_global_value(const char* name, const T& value, + CUstream stream = 0) const { + return set_global_array(name, &value, 1, stream); + } + + const std::string& mangled_name() const { + return _impl->cuda_kernel().function_name(); + } + + const std::string& ptx() const { return _impl->cuda_kernel().ptx(); } + + const std::vector& link_files() const { + return _impl->cuda_kernel().link_files(); + } + + const std::vector& link_paths() const { + return _impl->cuda_kernel().link_paths(); + } +}; + +/*! An object representing a kernel made up of a Program, a name and options. + */ +class Kernel { + friend class KernelInstantiation; + std::unique_ptr _impl; + + public: + Kernel(Program const& program, std::string name, + jitify::detail::vector options = 0); + + /*! Instantiate the kernel. + * + * \param template_args A vector of template arguments represented as + * code-strings. These can be generated using + * \code{.cpp}jitify::reflection::reflect()\endcode or + * \code{.cpp}jitify::reflection::reflect(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + // inline KernelInstantiation instantiate(std::vector const& + // template_args) const { + inline KernelInstantiation instantiate( + std::vector const& template_args = + std::vector()) const { + return KernelInstantiation(*this, template_args); + } + + // Regular template instantiation syntax (note limited flexibility) + /*! Instantiate the kernel. + * + * \note The template arguments specified on this function are + * used to instantiate the kernel. Non-type template arguments must + * be wrapped with + * \code{.cpp}jitify::reflection::NonType\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + inline KernelInstantiation instantiate() const { + return this->instantiate( + std::vector({reflection::reflect()...})); + } + // Template-like instantiation syntax + // E.g., instantiate(myvar,Type())(grid,block) + /*! Instantiate the kernel. + * + * \param targs The template arguments for the kernel, represented as + * values. Types must be wrapped with + * \code{.cpp}jitify::reflection::Type()\endcode or + * \code{.cpp}jitify::reflection::type_of(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + inline KernelInstantiation instantiate(TemplateArgs... targs) const { + return this->instantiate( + std::vector({reflection::reflect(targs)...})); + } +}; + +/*! An object representing a program made up of source code, headers + * and options. + */ +class Program { + friend class Kernel; + std::unique_ptr _impl; + + public: + Program(JitCache& cache, std::string source, + jitify::detail::vector headers = 0, + jitify::detail::vector options = 0, + file_callback_type file_callback = 0); + + /*! 
Select a kernel. + * + * \param name The name of the kernel (unmangled and without + * template arguments). + * \param options A vector of options to be passed to the NVRTC + * compiler when compiling this kernel. + */ + inline Kernel kernel(std::string name, + jitify::detail::vector options = 0) const { + return Kernel(*this, name, options); + } + /*! Select a kernel. + * + * \see kernel + */ + inline Kernel operator()( + std::string name, jitify::detail::vector options = 0) const { + return this->kernel(name, options); + } +}; + +/*! An object that manages a cache of JIT-compiled CUDA kernels. + * + */ +class JitCache { + friend class Program; + std::unique_ptr _impl; + + public: + /*! JitCache constructor. + * \param cache_size The number of kernels to hold in the cache + * before overwriting the least-recently-used ones. + */ + enum { DEFAULT_CACHE_SIZE = 128 }; + JitCache(size_t cache_size = DEFAULT_CACHE_SIZE) + : _impl(new JitCache_impl(cache_size)) {} + + /*! Create a program. + * + * \param source A string containing either the source filename or + * the source itself; in the latter case, the first line must be + * the name of the program. + * \param headers A vector of strings representing the source of + * each header file required by the program. Each entry can be + * either the header filename or the header source itself; in + * the latter case, the first line must be the name of the header + * (i.e., the name by which the header is #included). + * \param options A vector of options to be passed to the + * NVRTC compiler. Include paths specified with \p -I + * are added to the search paths used by Jitify. The environment + * variable JITIFY_OPTIONS can also be used to define additional + * options. + * \param file_callback A pointer to a callback function that is + * invoked whenever a source file needs to be loaded. Inside this + * function, the user can either load/specify the source themselves + * or defer to Jitify's file-loading mechanisms. + * \note Program or header source files referenced by filename are + * looked-up using the following mechanisms (in this order): + * \note 1) By calling file_callback. + * \note 2) By looking for the file embedded in the executable via the GCC + * linker. + * \note 3) By looking for the file in the filesystem. + * + * \note Jitify recursively scans all source files for \p #include + * directives and automatically adds them to the set of headers needed + * by the program. + * If a \p #include directive references a header that cannot be found, + * the directive is automatically removed from the source code to prevent + * immediate compilation failure. This may result in compilation errors + * if the header was required by the program. + * + * \note Jitify automatically includes NVRTC-safe versions of some + * standard library headers. 
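+ *
+ * \note A minimal sketch of creating and launching a program from an inline
+ * source string; the program name, kernel, and variable names are made up
+ * for illustration:
+ * \code{.cpp}
+ * static jitify::JitCache kernel_cache;
+ * const char* source =
+ *     "my_program\n"
+ *     "template<typename T>\n"
+ *     "__global__ void my_kernel(T* data) { data[threadIdx.x] += T(1); }\n";
+ * jitify::Program program = kernel_cache.program(source);
+ * float* d_data = nullptr;  // assume a valid device allocation here
+ * program.kernel("my_kernel")
+ *     .instantiate({"float"})
+ *     .configure(dim3(1), dim3(32))
+ *     .launch(d_data);
+ * \endcode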
+ */ + inline Program program(std::string source, + jitify::detail::vector headers = 0, + jitify::detail::vector options = 0, + file_callback_type file_callback = 0) { + return Program(*this, source, headers, options, file_callback); + } +}; + +inline Program::Program(JitCache& cache, std::string source, + jitify::detail::vector headers, + jitify::detail::vector options, + file_callback_type file_callback) + : _impl(new Program_impl(*cache._impl, source, headers, options, + file_callback)) {} + +inline Kernel::Kernel(Program const& program, std::string name, + jitify::detail::vector options) + : _impl(new Kernel_impl(*program._impl, name, options)) {} + +inline KernelInstantiation::KernelInstantiation( + Kernel const& kernel, std::vector const& template_args) + : _impl(new KernelInstantiation_impl(*kernel._impl, template_args)) {} + +inline KernelLauncher::KernelLauncher(KernelInstantiation const& kernel_inst, + dim3 grid, dim3 block, unsigned int smem, + cudaStream_t stream) + : _impl(new KernelLauncher_impl(*kernel_inst._impl, grid, block, smem, + stream)) {} + +inline std::ostream& operator<<(std::ostream& stream, dim3 d) { + if (d.y == 1 && d.z == 1) { + stream << d.x; + } else { + stream << "(" << d.x << "," << d.y << "," << d.z << ")"; + } + return stream; +} + +inline CUresult KernelLauncher_impl::launch( + jitify::detail::vector arg_ptrs, + jitify::detail::vector arg_types) const { +#if JITIFY_PRINT_LAUNCH + Kernel_impl const& kernel = _kernel_inst._kernel; + std::string arg_types_string = + (arg_types.empty() ? "..." : reflection::reflect_list(arg_types)); + std::cout << "Launching " << kernel._name << _kernel_inst._template_inst + << "<<<" << _grid << "," << _block << "," << _smem << "," << _stream + << ">>>" + << "(" << arg_types_string << ")" << std::endl; +#endif + if (!_kernel_inst._cuda_kernel) { + throw std::runtime_error( + "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE 1"); + } + return _kernel_inst._cuda_kernel->launch(_grid, _block, _smem, _stream, + arg_ptrs); +} + +inline KernelInstantiation_impl::KernelInstantiation_impl( + Kernel_impl const& kernel, std::vector const& template_args) + : _kernel(kernel), _options(kernel._options) { + _template_inst = + (template_args.empty() ? 
"" + : reflection::reflect_template(template_args)); + using detail::hash_combine; + using detail::hash_larson64; + _hash = _kernel._hash; + _hash = hash_combine(_hash, hash_larson64(_template_inst.c_str())); + JitCache_impl& cache = _kernel._program._cache; + uint64_t cache_key = _hash; +#if JITIFY_THREAD_SAFE + std::lock_guard lock(cache._kernel_cache_mutex); +#endif + if (cache._kernel_cache.contains(cache_key)) { +#if JITIFY_PRINT_INSTANTIATION + std::cout << "Found "; + this->print(); +#endif + _cuda_kernel = &cache._kernel_cache.get(cache_key); + } else { +#if JITIFY_PRINT_INSTANTIATION + std::cout << "Building "; + this->print(); +#endif + _cuda_kernel = &cache._kernel_cache.emplace(cache_key); + this->build_kernel(); + } +} + +inline void KernelInstantiation_impl::print() const { + std::string options_string = reflection::reflect_list(_options); + std::cout << _kernel._name << _template_inst << " [" << options_string << "]" + << std::endl; +} + +inline void KernelInstantiation_impl::build_kernel() { + Program_impl const& program = _kernel._program; + + std::string instantiation = _kernel._name + _template_inst; + + std::string log, ptx, mangled_instantiation; + std::vector linker_files, linker_paths; + detail::instantiate_kernel(program.name(), program.sources(), instantiation, + _options, &log, &ptx, &mangled_instantiation, + &linker_files, &linker_paths); + + _cuda_kernel->set(mangled_instantiation.c_str(), ptx.c_str(), linker_files, + linker_paths); +} + +Kernel_impl::Kernel_impl(Program_impl const& program, std::string name, + jitify::detail::vector options) + : _program(program), _name(name), _options(options) { + // Merge options from parent + _options.insert(_options.end(), _program.options().begin(), + _program.options().end()); + detail::detect_and_add_cuda_arch(_options); + detail::detect_and_add_cxx11_flag(_options); + std::string options_string = reflection::reflect_list(_options); + using detail::hash_combine; + using detail::hash_larson64; + _hash = _program._hash; + _hash = hash_combine(_hash, hash_larson64(_name.c_str())); + _hash = hash_combine(_hash, hash_larson64(options_string.c_str())); +} + +Program_impl::Program_impl(JitCache_impl& cache, std::string source, + jitify::detail::vector headers, + jitify::detail::vector options, + file_callback_type file_callback) + : _cache(cache) { + // Compute hash of source, headers and options + std::string options_string = reflection::reflect_list(options); + using detail::hash_combine; + using detail::hash_larson64; + _hash = hash_combine(hash_larson64(source.c_str()), + hash_larson64(options_string.c_str())); + for (size_t i = 0; i < headers.size(); ++i) { + _hash = hash_combine(_hash, hash_larson64(headers[i].c_str())); + } + _hash = hash_combine(_hash, (uint64_t)file_callback); + // Add pre-include built-in JIT-safe headers + for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) { + const char* hdr_name = detail::preinclude_jitsafe_header_names[i]; + const std::string& hdr_source = + detail::get_jitsafe_headers_map().at(hdr_name); + headers.push_back(std::string(hdr_name) + "\n" + hdr_source); + } + // Merge options from parent + options.insert(options.end(), _cache._options.begin(), _cache._options.end()); + // Load sources +#if JITIFY_THREAD_SAFE + std::lock_guard lock(cache._program_cache_mutex); +#endif + if (!cache._program_config_cache.contains(_hash)) { + _config = &cache._program_config_cache.insert(_hash); + this->load_sources(source, headers, options, file_callback); + } else { + _config = 
&cache._program_config_cache.get(_hash); + } +} + +inline void Program_impl::load_sources(std::string source, + std::vector headers, + std::vector options, + file_callback_type file_callback) { + _config->options = options; + detail::load_program(source, headers, file_callback, &_config->include_paths, + &_config->sources, &_config->options, &_config->name); +} + +enum Location { HOST, DEVICE }; + +/*! Specifies location and parameters for execution of an algorithm. + * \param stream The CUDA stream on which to execute. + * \param headers A vector of headers to include in the code. + * \param options Options to pass to the NVRTC compiler. + * \param file_callback See jitify::Program. + * \param block_size The size of the CUDA thread block with which to + * execute. + * \param cache_size The number of kernels to store in the cache + * before overwriting the least-recently-used ones. + */ +struct ExecutionPolicy { + /*! Location (HOST or DEVICE) on which to execute.*/ + Location location; + /*! List of headers to include when compiling the algorithm.*/ + std::vector headers; + /*! List of compiler options.*/ + std::vector options; + /*! Optional callback for loading source files.*/ + file_callback_type file_callback; + /*! CUDA stream on which to execute.*/ + cudaStream_t stream; + /*! CUDA device on which to execute.*/ + int device; + /*! CUDA block size with which to execute.*/ + int block_size; + /*! The number of instantiations to store in the cache before overwriting + * the least-recently-used ones.*/ + size_t cache_size; + ExecutionPolicy(Location location_ = DEVICE, + jitify::detail::vector headers_ = 0, + jitify::detail::vector options_ = 0, + file_callback_type file_callback_ = 0, + cudaStream_t stream_ = 0, int device_ = 0, + int block_size_ = 256, + size_t cache_size_ = JitCache::DEFAULT_CACHE_SIZE) + : location(location_), + headers(headers_), + options(options_), + file_callback(file_callback_), + stream(stream_), + device(device_), + block_size(block_size_), + cache_size(cache_size_) {} +}; + +template +class Lambda; + +/*! An object that captures a set of variables for use in a parallel_for + * expression. See JITIFY_CAPTURE(). + */ +class Capture { + public: + std::vector _arg_decls; + std::vector _arg_ptrs; + + public: + template + inline Capture(std::vector arg_names, Args const&... args) + : _arg_ptrs{(void*)&args...} { + std::vector arg_types = {reflection::reflect()...}; + _arg_decls.resize(arg_names.size()); + for (int i = 0; i < (int)arg_names.size(); ++i) { + _arg_decls[i] = arg_types[i] + " " + arg_names[i]; + } + } +}; + +/*! An object that captures the instantiated Lambda function for use + in a parallel_for expression and the function string for NVRTC + compilation + */ +template +class Lambda { + public: + Capture _capture; + std::string _func_string; + Func _func; + + public: + inline Lambda(Capture const& capture, std::string func_string, Func func) + : _capture(capture), _func_string(func_string), _func(func) {} +}; + +template +inline Lambda make_Lambda(Capture const& capture, std::string func, + T lambda) { + return Lambda(capture, func, lambda); +} + +#define JITIFY_CAPTURE(...) \ + jitify::Capture(jitify::detail::split_string(#__VA_ARGS__, -1, ","), \ + __VA_ARGS__) + +#define JITIFY_MAKE_LAMBDA(capture, x, ...) \ + jitify::make_Lambda(capture, std::string(#__VA_ARGS__), \ + [x](int i) { __VA_ARGS__; }) + +#define JITIFY_ARGS(...) __VA_ARGS__ + +#define JITIFY_LAMBDA_(x, ...) 
\ + JITIFY_MAKE_LAMBDA(JITIFY_CAPTURE(x), JITIFY_ARGS(x), __VA_ARGS__) + +// macro sequence to strip surrounding brackets +#define JITIFY_STRIP_PARENS(X) X +#define JITIFY_PASS_PARAMETERS(X) JITIFY_STRIP_PARENS(JITIFY_ARGS X) + +/*! Creates a Lambda object with captured variables and a function + * definition. + * \param capture A bracket-enclosed list of variables to capture. + * \param ... The function definition. + * + * \code{.cpp} + * float* capture_me; + * int capture_me_too; + * auto my_lambda = JITIFY_LAMBDA( (capture_me, capture_me_too), + * capture_me[i] = i*capture_me_too ); + * \endcode + */ +#define JITIFY_LAMBDA(capture, ...) \ + JITIFY_LAMBDA_(JITIFY_ARGS(JITIFY_PASS_PARAMETERS(capture)), \ + JITIFY_ARGS(__VA_ARGS__)) + +// TODO: Try to implement for_each that accepts iterators instead of indices +// Add compile guard for NOCUDA compilation +/*! Call a function for a range of indices + * + * \param policy Determines the location and device parameters for + * execution of the parallel_for. + * \param begin The starting index. + * \param end The ending index. + * \param lambda A Lambda object created using the JITIFY_LAMBDA() macro. + * + * \code{.cpp} + * char const* in; + * float* out; + * parallel_for(0, 100, JITIFY_LAMBDA( (in, out), {char x = in[i]; out[i] = + * x*x; } ); \endcode + */ +template +CUresult parallel_for(ExecutionPolicy policy, IndexType begin, IndexType end, + Lambda const& lambda) { + using namespace jitify; + + if (policy.location == HOST) { +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (IndexType i = begin; i < end; i++) { + lambda._func(i); + } + return CUDA_SUCCESS; // FIXME - replace with non-CUDA enum type? + } + + thread_local static JitCache kernel_cache(policy.cache_size); + + std::vector arg_decls; + arg_decls.push_back("I begin, I end"); + arg_decls.insert(arg_decls.end(), lambda._capture._arg_decls.begin(), + lambda._capture._arg_decls.end()); + + std::stringstream source_ss; + source_ss << "parallel_for_program\n"; + for (auto const& header : policy.headers) { + std::string header_name = header.substr(0, header.find("\n")); + source_ss << "#include <" << header_name << ">\n"; + } + source_ss << "template\n" + "__global__\n" + "void parallel_for_kernel(" + << reflection::reflect_list(arg_decls) + << ") {\n" + " I i0 = threadIdx.x + blockDim.x*blockIdx.x;\n" + " for( I i=i0+begin; i arg_ptrs; + arg_ptrs.push_back(&begin); + arg_ptrs.push_back(&end); + arg_ptrs.insert(arg_ptrs.end(), lambda._capture._arg_ptrs.begin(), + lambda._capture._arg_ptrs.end()); + + size_t n = end - begin; + dim3 block(policy.block_size); + dim3 grid((unsigned int)std::min((n - 1) / block.x + 1, size_t(65535))); + cudaSetDevice(policy.device); + return program.kernel("parallel_for_kernel") + .instantiate() + .configure(grid, block, 0, policy.stream) + .launch(arg_ptrs); +} + +namespace experimental { + +using jitify::file_callback_type; + +namespace serialization { + +namespace detail { + +// This should be incremented whenever the serialization format changes in any +// incompatible way. 
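+//
+// For reference, the format produced by the serialize()/deserialize()
+// overloads below is, informally: the 4-byte magic "JTFY", the version as a
+// 64-bit count, then each value in order, where sizes/counts are written as
+// 64-bit integers, strings as a count followed by their raw bytes, and
+// vectors/maps as an element count followed by their entries. Values use the
+// host byte order; no endianness conversion is performed.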
+static constexpr const size_t kSerializationVersion = 1; + +inline void serialize(std::ostream& stream, size_t u) { + uint64_t u64 = u; + stream.write(reinterpret_cast(&u64), sizeof(u64)); +} + +inline bool deserialize(std::istream& stream, size_t* size) { + uint64_t u64; + stream.read(reinterpret_cast(&u64), sizeof(u64)); + *size = u64; + return stream.good(); +} + +inline void serialize(std::ostream& stream, std::string const& s) { + serialize(stream, s.size()); + stream.write(s.data(), s.size()); +} + +inline bool deserialize(std::istream& stream, std::string* s) { + size_t size; + if (!deserialize(stream, &size)) return false; + s->resize(size); + if (s->size()) { + stream.read(&(*s)[0], s->size()); + } + return stream.good(); +} + +inline void serialize(std::ostream& stream, std::vector const& v) { + serialize(stream, v.size()); + for (auto const& s : v) { + serialize(stream, s); + } +} + +inline bool deserialize(std::istream& stream, std::vector* v) { + size_t size; + if (!deserialize(stream, &size)) return false; + v->resize(size); + for (auto& s : *v) { + if (!deserialize(stream, &s)) return false; + } + return true; +} + +inline void serialize(std::ostream& stream, + std::map const& m) { + serialize(stream, m.size()); + for (auto const& kv : m) { + serialize(stream, kv.first); + serialize(stream, kv.second); + } +} + +inline bool deserialize(std::istream& stream, + std::map* m) { + size_t size; + if (!deserialize(stream, &size)) return false; + for (size_t i = 0; i < size; ++i) { + std::string key; + if (!deserialize(stream, &key)) return false; + if (!deserialize(stream, &(*m)[key])) return false; + } + return true; +} + +template +inline void serialize(std::ostream& stream, T const& value, Rest... rest) { + serialize(stream, value); + serialize(stream, rest...); +} + +template +inline bool deserialize(std::istream& stream, T* value, Rest... rest) { + if (!deserialize(stream, value)) return false; + return deserialize(stream, rest...); +} + +inline void serialize_magic_number(std::ostream& stream) { + stream.write("JTFY", 4); + serialize(stream, kSerializationVersion); +} + +inline bool deserialize_magic_number(std::istream& stream) { + char magic_number[4] = {0, 0, 0, 0}; + stream.read(&magic_number[0], 4); + if (!(magic_number[0] == 'J' && magic_number[1] == 'T' && + magic_number[2] == 'F' && magic_number[3] == 'Y')) { + return false; + } + size_t serialization_version; + if (!deserialize(stream, &serialization_version)) return false; + return serialization_version == kSerializationVersion; +} + +} // namespace detail + +template +inline std::string serialize(Values const&... values) { + std::ostringstream ss(std::stringstream::out | std::stringstream::binary); + detail::serialize_magic_number(ss); + detail::serialize(ss, values...); + return ss.str(); +} + +template +inline bool deserialize(std::string const& serialized, Values*... values) { + std::istringstream ss(serialized, + std::stringstream::in | std::stringstream::binary); + if (!detail::deserialize_magic_number(ss)) return false; + return detail::deserialize(ss, values...); +} + +} // namespace serialization + +class Program; +class Kernel; +class KernelInstantiation; +class KernelLauncher; + +/*! An object representing a program made up of source code, headers + * and options. + */ +class Program { + private: + friend class KernelInstantiation; + std::string _name; + std::vector _options; + std::map _sources; + + // Private constructor used by deserialize() + Program() {} + + public: + /*! Create a program. 
+ * + * \param source A string containing either the source filename or + * the source itself; in the latter case, the first line must be + * the name of the program. + * \param headers A vector of strings representing the source of + * each header file required by the program. Each entry can be + * either the header filename or the header source itself; in + * the latter case, the first line must be the name of the header + * (i.e., the name by which the header is #included). + * \param options A vector of options to be passed to the + * NVRTC compiler. Include paths specified with \p -I + * are added to the search paths used by Jitify. The environment + * variable JITIFY_OPTIONS can also be used to define additional + * options. + * \param file_callback A pointer to a callback function that is + * invoked whenever a source file needs to be loaded. Inside this + * function, the user can either load/specify the source themselves + * or defer to Jitify's file-loading mechanisms. + * \note Program or header source files referenced by filename are + * looked-up using the following mechanisms (in this order): + * \note 1) By calling file_callback. + * \note 2) By looking for the file embedded in the executable via the GCC + * linker. + * \note 3) By looking for the file in the filesystem. + * + * \note Jitify recursively scans all source files for \p #include + * directives and automatically adds them to the set of headers needed + * by the program. + * If a \p #include directive references a header that cannot be found, + * the directive is automatically removed from the source code to prevent + * immediate compilation failure. This may result in compilation errors + * if the header was required by the program. + * + * \note Jitify automatically includes NVRTC-safe versions of some + * standard library headers. + */ + Program(std::string const& cuda_source, + std::vector const& given_headers = {}, + std::vector const& given_options = {}, + file_callback_type file_callback = nullptr) { + // Add pre-include built-in JIT-safe headers + std::vector headers = given_headers; + for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) { + const char* hdr_name = detail::preinclude_jitsafe_header_names[i]; + const std::string& hdr_source = + detail::get_jitsafe_headers_map().at(hdr_name); + headers.push_back(std::string(hdr_name) + "\n" + hdr_source); + } + + _options = given_options; + detail::add_options_from_env(_options); + std::vector include_paths; + detail::load_program(cuda_source, headers, file_callback, &include_paths, + &_sources, &_options, &_name); + } + + /*! Restore a serialized program. + * + * \param serialized_program The serialized program to restore. + * + * \see serialize + */ + static Program deserialize(std::string const& serialized_program) { + Program program; + if (!serialization::deserialize(serialized_program, &program._name, + &program._options, &program._sources)) { + throw std::runtime_error("Failed to deserialize program"); + } + return program; + } + + /*! Save the program. + * + * \see deserialize + */ + std::string serialize() const { + // Note: Must update kSerializationVersion if this is changed. + return serialization::serialize(_name, _options, _sources); + }; + + /*! Select a kernel. + * + * \param name The name of the kernel (unmangled and without + * template arguments). + * \param options A vector of options to be passed to the NVRTC + * compiler when compiling this kernel. 
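+ *
+ * \note A minimal sketch of the experimental flow, including serialization
+ * (the kernel name, template argument, and 'source' are placeholders):
+ * \code{.cpp}
+ * jitify::experimental::Program program(source);
+ * auto inst = program.kernel("my_kernel").instantiate({"float"});
+ * std::string blob = inst.serialize();  // e.g., cache this on disk
+ * auto restored =
+ *     jitify::experimental::KernelInstantiation::deserialize(blob);
+ * \endcode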
+ */ + Kernel kernel(std::string const& name, + std::vector const& options = {}) const; +}; + +class Kernel { + friend class KernelInstantiation; + Program const* _program; + std::string _name; + std::vector _options; + + public: + Kernel(Program const* program, std::string const& name, + std::vector const& options = {}) + : _program(program), _name(name), _options(options) {} + + /*! Instantiate the kernel. + * + * \param template_args A vector of template arguments represented as + * code-strings. These can be generated using + * \code{.cpp}jitify::reflection::reflect()\endcode or + * \code{.cpp}jitify::reflection::reflect(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + KernelInstantiation instantiate( + std::vector const& template_args = + std::vector()) const; + + // Regular template instantiation syntax (note limited flexibility) + /*! Instantiate the kernel. + * + * \note The template arguments specified on this function are + * used to instantiate the kernel. Non-type template arguments must + * be wrapped with + * \code{.cpp}jitify::reflection::NonType\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + KernelInstantiation instantiate() const; + + // Template-like instantiation syntax + // E.g., instantiate(myvar,Type())(grid,block) + /*! Instantiate the kernel. + * + * \param targs The template arguments for the kernel, represented as + * values. Types must be wrapped with + * \code{.cpp}jitify::reflection::Type()\endcode or + * \code{.cpp}jitify::reflection::type_of(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + KernelInstantiation instantiate(TemplateArgs... targs) const; +}; + +class KernelInstantiation { + friend class KernelLauncher; + std::unique_ptr _cuda_kernel; + + // Private constructor used by deserialize() + KernelInstantiation(std::string const& func_name, std::string const& ptx, + std::vector const& link_files, + std::vector const& link_paths) + : _cuda_kernel(new detail::CUDAKernel(func_name.c_str(), ptx.c_str(), + link_files, link_paths)) {} + + public: + KernelInstantiation(Kernel const& kernel, + std::vector const& template_args) { + Program const* program = kernel._program; + + std::string template_inst = + (template_args.empty() ? "" + : reflection::reflect_template(template_args)); + std::string instantiation = kernel._name + template_inst; + + std::vector options; + options.insert(options.begin(), program->_options.begin(), + program->_options.end()); + options.insert(options.begin(), kernel._options.begin(), + kernel._options.end()); + detail::detect_and_add_cuda_arch(options); + detail::detect_and_add_cxx11_flag(options); + + std::string log, ptx, mangled_instantiation; + std::vector linker_files, linker_paths; + detail::instantiate_kernel(program->_name, program->_sources, instantiation, + options, &log, &ptx, &mangled_instantiation, + &linker_files, &linker_paths); + + _cuda_kernel.reset(new detail::CUDAKernel(mangled_instantiation.c_str(), + ptx.c_str(), linker_files, + linker_paths)); + } + + /*! Implicit conversion to the underlying CUfunction object. + * + * \note This allows use of CUDA APIs like + * cuOccupancyMaxActiveBlocksPerMultiprocessor. + */ + operator CUfunction() const { return *_cuda_kernel; } + + /*! Restore a serialized kernel instantiation. 
+ * + * \param serialized_kernel_inst The serialized kernel instantiation to + * restore. + * + * \see serialize + */ + static KernelInstantiation deserialize( + std::string const& serialized_kernel_inst) { + std::string func_name, ptx; + std::vector link_files, link_paths; + if (!serialization::deserialize(serialized_kernel_inst, &func_name, &ptx, + &link_files, &link_paths)) { + throw std::runtime_error("Failed to deserialize kernel instantiation"); + } + return KernelInstantiation(func_name, ptx, link_files, link_paths); + } + + /*! Save the program. + * + * \see deserialize + */ + std::string serialize() const { + // Note: Must update kSerializationVersion if this is changed. + return serialization::serialize( + _cuda_kernel->function_name(), _cuda_kernel->ptx(), + _cuda_kernel->link_files(), _cuda_kernel->link_paths()); + } + + /*! Configure the kernel launch. + * + * \param grid The thread grid dimensions for the launch. + * \param block The thread block dimensions for the launch. + * \param smem The amount of shared memory to dynamically allocate, in + * bytes. + * \param stream The CUDA stream to launch the kernel in. + */ + KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) const; + + /*! Configure the kernel launch with a 1-dimensional block and grid chosen + * automatically to maximise occupancy. + * + * \param max_block_size The upper limit on the block size, or 0 for no + * limit. + * \param smem The amount of shared memory to dynamically allocate, in bytes. + * \param smem_callback A function returning smem for a given block size + * (overrides \p smem). + * \param stream The CUDA stream to launch the kernel in. + * \param flags The flags to pass to + * cuOccupancyMaxPotentialBlockSizeWithFlags. + */ + KernelLauncher configure_1d_max_occupancy( + int max_block_size = 0, unsigned int smem = 0, + CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0, + unsigned int flags = 0) const; + + /* + * \deprecated Use \p get_global_ptr instead. + */ + CUdeviceptr get_constant_ptr(const char* name, size_t* size = nullptr) const { + return get_global_ptr(name, size); + } + + /* + * Get a device pointer to a global __constant__ or __device__ variable using + * its un-mangled name. If provided, *size is set to the size of the variable + * in bytes. + */ + CUdeviceptr get_global_ptr(const char* name, size_t* size = nullptr) const { + return _cuda_kernel->get_global_ptr(name, size); + } + + /* + * Copy data from a global __constant__ or __device__ array to the host using + * its un-mangled name. + */ + template + CUresult get_global_array(const char* name, T* data, size_t count, + CUstream stream = 0) const { + return _cuda_kernel->get_global_data(name, data, count, stream); + } + + /* + * Copy a value from a global __constant__ or __device__ variable to the host + * using its un-mangled name. + */ + template + CUresult get_global_value(const char* name, T* value, + CUstream stream = 0) const { + return get_global_array(name, value, 1, stream); + } + + /* + * Copy data from the host to a global __constant__ or __device__ array using + * its un-mangled name. + */ + template + CUresult set_global_array(const char* name, const T* data, size_t count, + CUstream stream = 0) const { + return _cuda_kernel->set_global_data(name, data, count, stream); + } + + /* + * Copy a value from the host to a global __constant__ or __device__ variable + * using its un-mangled name. 
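+ *
+ * A minimal sketch, assuming the JIT-compiled source declares a variable
+ * such as "__device__ float scale;" (the name is hypothetical) and
+ * kernel_inst is this KernelInstantiation:
+ *
+ *   kernel_inst.set_global_value("scale", 2.0f);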
+ */ + template + CUresult set_global_value(const char* name, const T& value, + CUstream stream = 0) const { + return set_global_array(name, &value, 1, stream); + } + + const std::string& mangled_name() const { + return _cuda_kernel->function_name(); + } + + const std::string& ptx() const { return _cuda_kernel->ptx(); } + + const std::vector& link_files() const { + return _cuda_kernel->link_files(); + } + + const std::vector& link_paths() const { + return _cuda_kernel->link_paths(); + } +}; + +class KernelLauncher { + KernelInstantiation const* _kernel_inst; + dim3 _grid; + dim3 _block; + unsigned int _smem; + cudaStream_t _stream; + + public: + KernelLauncher(KernelInstantiation const* kernel_inst, dim3 grid, dim3 block, + unsigned int smem = 0, cudaStream_t stream = 0) + : _kernel_inst(kernel_inst), + _grid(grid), + _block(block), + _smem(smem), + _stream(stream) {} + + // Note: It's important that there is no implicit conversion required + // for arg_ptrs, because otherwise the parameter pack version + // below gets called instead (probably resulting in a segfault). + /*! Launch the kernel. + * + * \param arg_ptrs A vector of pointers to each function argument for the + * kernel. + * \param arg_types A vector of function argument types represented + * as code-strings. This parameter is optional and is only used to print + * out the function signature. + */ + CUresult launch(std::vector arg_ptrs = {}, + std::vector arg_types = {}) const { +#if JITIFY_PRINT_LAUNCH + std::string arg_types_string = + (arg_types.empty() ? "..." : reflection::reflect_list(arg_types)); + std::cout << "Launching " << _kernel_inst->_cuda_kernel->function_name() + << "<<<" << _grid << "," << _block << "," << _smem << "," + << _stream << ">>>" + << "(" << arg_types_string << ")" << std::endl; +#endif + return _kernel_inst->_cuda_kernel->launch(_grid, _block, _smem, _stream, + arg_ptrs); + } + + /*! Launch the kernel. + * + * \param args Function arguments for the kernel. + */ + template + CUresult launch(ArgTypes... args) const { + return this->launch(std::vector({(void*)&args...}), + {reflection::reflect()...}); + } +}; + +inline Kernel Program::kernel(std::string const& name, + std::vector const& options) const { + return Kernel(this, name, options); +} + +inline KernelInstantiation Kernel::instantiate( + std::vector const& template_args) const { + return KernelInstantiation(*this, template_args); +} + +template +inline KernelInstantiation Kernel::instantiate() const { + return this->instantiate( + std::vector({reflection::reflect()...})); +} + +template +inline KernelInstantiation Kernel::instantiate(TemplateArgs... 
targs) const { + return this->instantiate( + std::vector({reflection::reflect(targs)...})); +} + +inline KernelLauncher KernelInstantiation::configure( + dim3 grid, dim3 block, unsigned int smem, cudaStream_t stream) const { + return KernelLauncher(this, grid, block, smem, stream); +} + +inline KernelLauncher KernelInstantiation::configure_1d_max_occupancy( + int max_block_size, unsigned int smem, CUoccupancyB2DSize smem_callback, + cudaStream_t stream, unsigned int flags) const { + int grid; + int block; + CUfunction func = *_cuda_kernel; + detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size, + flags, &grid, &block); + return this->configure(grid, block, smem, stream); +} + +} // namespace experimental + +} // namespace jitify + +#if defined(_WIN32) || defined(_WIN64) +#pragma pop_macro("max") +#pragma pop_macro("min") +#pragma pop_macro("strtok_r") +#endif diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_histogram.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_histogram.cuh new file mode 100644 index 0000000000..37b1ec9734 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_histogram.cuh @@ -0,0 +1,787 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . 
+ */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_load.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * + */ +enum BlockHistogramMemoryPreference +{ + GMEM, + SMEM, + BLEND +}; + + +/** + * Parameterizable tuning policy type for AgentHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue +struct AgentHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. + int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. 
+ int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading samples + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT, ///< Signed integer type for global offsets + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + /// The pixel type of SampleT + typedef typename CubVector::Type PixelT; + + /// The quad type of SampleT + typedef typename CubVector::Type QuadT; + + /// Constants + enum + { + BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + + PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, + SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, + QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + + TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, + TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + + IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + + MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? + AgentHistogramPolicyT::MEM_PREFERENCE : + GMEM, + + IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, + }; + + /// Cache load modifier for reading input elements + static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; + + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + SampleIteratorT>::Type // Directly use the supplied input iterator type + WrappedSampleIteratorT; + + /// Pixel input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedPixelIteratorT; + + /// Qaud input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedQuadIteratorT; + + /// Parameterized BlockLoad type for samples + typedef BlockLoad< + SampleT, + BLOCK_THREADS, + SAMPLES_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadSampleT; + + /// Parameterized BlockLoad type for pixels + typedef BlockLoad< + PixelT, + BLOCK_THREADS, + PIXELS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadPixelT; + + /// Parameterized BlockLoad type for quads + typedef BlockLoad< + QuadT, + BLOCK_THREADS, + QUADS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadQuadT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + + int tile_idx; + + // Aliasable storage layout + union Aliasable + { + typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples + typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels + typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + + } aliasable; + }; + + + /// 
Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Sample input iterator (with cache modifier applied, if possible) + WrappedSampleIteratorT d_wrapped_samples; + + /// Native pointer for input samples (possibly NULL if unavailable) + SampleT* d_native_samples; + + /// The number of output bins for each channel + int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + + /// The number of privatized bins for each channel + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + + /// Reference to gmem privatized histograms for each channel + CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; + + /// Reference to final output histograms (gmem) + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining privatized counter indices from samples, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// Whether to prefer privatized smem counters vs privatized global counters + bool prefer_smem; + + + //--------------------------------------------------------------------- + // Initialize privatized bin counters + //--------------------------------------------------------------------- + + // Initialize privatized bin counters + __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) + { + privatized_histograms[CHANNEL][privatized_bin] = 0; + } + } + + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + } + + + // Initialize privatized bin counters. Specialized for privatized shared-memory counters + __device__ __forceinline__ void InitSmemBinCounters() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + InitBinCounters(privatized_histograms); + } + + + // Initialize privatized bin counters. 
Specialized for privatized global-memory counters + __device__ __forceinline__ void InitGmemBinCounters() + { + InitBinCounters(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Update final output histograms + //--------------------------------------------------------------------- + + // Update final output histograms from privatized histograms + __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + + // Apply privatized bin counts to output bin counts + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_bins = num_privatized_bins[CHANNEL]; + for (int privatized_bin = threadIdx.x; + privatized_bin < channel_bins; + privatized_bin += BLOCK_THREADS) + { + int output_bin = -1; + CounterT count = privatized_histograms[CHANNEL][privatized_bin]; + bool is_valid = count > 0; + + output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); + + if (output_bin >= 0) + { + atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); + } + + } + } + } + + + // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters + __device__ __forceinline__ void StoreSmemOutput() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + StoreOutput(privatized_histograms); + } + + + // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters + __device__ __forceinline__ void StoreGmemOutput() + { + StoreOutput(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Tile accumulation + //--------------------------------------------------------------------- + + // Accumulate pixels. Specialized for RLE compression. + __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + // Bin pixels + int bins[PIXELS_PER_THREAD]; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + bins[PIXEL] = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); + } + + CounterT accumulator = 1; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) + { + if (bins[PIXEL] != bins[PIXEL + 1]) + { + if (bins[PIXEL] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); + + accumulator = 0; + } + accumulator++; + } + + // Last pixel + if (bins[PIXELS_PER_THREAD - 1] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); + } + } + + + // Accumulate pixels. Specialized for individual accumulation of each pixel. 
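// A host-side sketch (not part of the CUB sources) of the localized run-length
// compression performed by the RLE-specialized AccumulatePixels above: a run of
// identical bin ids held by one thread is folded into a single add instead of
// one atomic per pixel, reducing atomic traffic to the privatized histogram.
// The bin values and sizes below are made up for illustration; a bin of -1
// marks an invalid pixel, as in the agent. The per-pixel (non-RLE)
// specialization follows right after this sketch.
#include <cstdio>

int main()
{
    const int PIXELS_PER_THREAD = 8;
    int bins[PIXELS_PER_THREAD] = {2, 2, 2, 5, 5, -1, 7, 7};
    int histogram[16] = {0};

    int accumulator = 1;
    for (int p = 0; p < PIXELS_PER_THREAD - 1; ++p)
    {
        if (bins[p] != bins[p + 1])
        {
            if (bins[p] >= 0)
                histogram[bins[p]] += accumulator;   // atomicAdd in the device code
            accumulator = 0;
        }
        accumulator++;
    }
    if (bins[PIXELS_PER_THREAD - 1] >= 0)            // flush the final run
        histogram[bins[PIXELS_PER_THREAD - 1]] += accumulator;

    std::printf("bin 2: %d, bin 5: %d, bin 7: %d\n",
                histogram[2], histogram[5], histogram[7]);   // prints 3, 2, 2
    return 0;
}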
+ __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int bin = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); + if (bin >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bin, 1); + } + } + } + + + /** + * Accumulate pixel, specialized for smem privatized histogram + */ + __device__ __forceinline__ void AccumulateSmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); + } + + + /** + * Accumulate pixel, specialized for gmem privatized histogram + */ + __device__ __forceinline__ void AccumulateGmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); + } + + + + //--------------------------------------------------------------------- + // Tile loading + //--------------------------------------------------------------------- + + // Load full, aligned tile using pixel iterator (multi-channel) + template + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples)); + } + + // Load full, aligned tile using quad iterator (single-channel) + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<1> num_active_channels) + { + typedef QuadT AliasedQuads[QUADS_PER_THREAD]; + + WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + + // Load using a wrapped quad iterator + BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( + d_wrapped_quads, + reinterpret_cast(samples)); + } + + // Load full, aligned tile + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); + } + + // Load full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + // Load using sample iterator + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples)); + } + + // Load partially-full, aligned tile using the pixel iterator + 
__device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + int valid_pixels = valid_samples / NUM_CHANNELS; + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples), + valid_pixels); + } + + // Load partially-full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples), + valid_samples); + } + + + //--------------------------------------------------------------------- + // Tile processing + //--------------------------------------------------------------------- + + // Consume a tile of data samples + template < + bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_FULL_TILE> // Whether the tile is full + __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) + { + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; + bool is_valid[PIXELS_PER_THREAD]; + + // Load tile + LoadTile( + block_offset, + valid_samples, + samples, + Int2Type(), + Int2Type()); + + // Set valid flags + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + + // Accumulate samples +#if CUB_PTX_ARCH >= 120 + if (prefer_smem) + AccumulateSmemPixels(samples, is_valid); + else + AccumulateGmemPixels(samples, is_valid); +#else + AccumulateGmemPixels(samples, is_valid); +#endif + + } + + + // Consume row tiles. 
Specialized for work-stealing from queue + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + + int num_tiles = num_rows * tiles_per_row; + int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; + OffsetT num_even_share_tiles = gridDim.x * gridDim.y; + + while (tile_idx < num_tiles) + { + int row = tile_idx / tiles_per_row; + int col = tile_idx - (row * tiles_per_row); + OffsetT row_offset = row * row_stride_samples; + OffsetT col_offset = (col * TILE_SAMPLES); + OffsetT tile_offset = row_offset + col_offset; + + if (col == tiles_per_row - 1) + { + // Consume a partially-full tile at the end of the row + OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; + ConsumeTile(tile_offset, num_remaining); + } + else + { + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + } + + CTA_SYNC(); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; + + CTA_SYNC(); + + tile_idx = temp_storage.tile_idx; + } + } + + + // Consume row tiles. Specialized for even-share (striped across thread blocks) + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + for (int row = blockIdx.y; row < num_rows; row += gridDim.y) + { + OffsetT row_begin = row * row_stride_samples; + OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); + OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); + + while (tile_offset < row_end) + { + OffsetT num_remaining = row_end - tile_offset; + + if (num_remaining < TILE_SAMPLES) + { + // Consume partial tile + ConsumeTile(tile_offset, num_remaining); + break; + } + + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + tile_offset += gridDim.x * TILE_SAMPLES; + } + } + } + + + //--------------------------------------------------------------------- + // Parameter extraction + //--------------------------------------------------------------------- + + // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) + template < + CacheLoadModifier _MODIFIER, + typename _ValueT, + typename _OffsetT> + __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) + { + return itr.ptr; + } + + // Return a native pixel pointer (specialized for other types) + template + __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) + { + return NULL; + } + + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + /** + * Constructor + */ + __device__ __forceinline__ AgentHistogram( + TempStorage &temp_storage, ///< Reference to temp_storage + SampleIteratorT d_samples, ///< Input data to reduce + 
int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms + CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel + : + temp_storage(temp_storage.Alias()), + d_wrapped_samples(d_samples), + num_output_bins(num_output_bins), + num_privatized_bins(num_privatized_bins), + d_output_histograms(d_output_histograms), + privatized_decode_op(privatized_decode_op), + output_decode_op(output_decode_op), + d_native_samples(NativePointer(d_wrapped_samples)), + prefer_smem((MEM_PREFERENCE == SMEM) ? + true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? + false : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms + { + int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; + + // Initialize the locations of this block's privatized histograms + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); + } + + + /** + * Consume image + */ + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) + int quad_mask = AlignBytes::ALIGN_BYTES - 1; + int pixel_mask = AlignBytes::ALIGN_BYTES - 1; + size_t row_bytes = sizeof(SampleT) * row_stride_samples; + + bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel + ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + + bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel + ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned + ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel + + // Whether rows are aligned and can be vectorized + if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + else + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + } + + + /** + * Initialize privatized bin counters. 
Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void InitBinCounters() + { + if (prefer_smem) + InitSmemBinCounters(); + else + InitGmemBinCounters(); + } + + + /** + * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void StoreOutput() + { + if (prefer_smem) + StoreSmemOutput(); + else + StoreGmemOutput(); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_downsweep.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_downsweep.cuh new file mode 100644 index 0000000000..faea88138e --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_downsweep.cuh @@ -0,0 +1,789 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ + + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Radix ranking algorithm + */ +enum RadixRankAlgorithm +{ + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH +}; + +/** + * Parameterizable tuning policy type for AgentRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + + + + +/** + * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ +template < + typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< KeyT type + typename ValueT, ///< ValueT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of KeyT + typedef typename Traits::UnsignedBits UnsignedBits; + + static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; + static const UnsignedBits MAX_KEY = Traits::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; + static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; + + enum + { + BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + typedef CacheModifiedInputIterator ValuesItr; + + // Radix ranking type to use + typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), + BlockRadixRank, + typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + BlockRadixRank, + BlockRadixRankMatch + >::Type + >::Type BlockRadixRankT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD + }; + + // BlockLoad type (keys) + typedef BlockLoad< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadKeysT; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadValuesT; + + // Value exchange array type + typedef ValueT ValueExchangeT[TILE_ITEMS]; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + typename BlockLoadKeysT::TempStorage load_keys; + typename BlockLoadValuesT::TempStorage load_values; + typename BlockRadixRankT::TempStorage radix_rank; + + struct + { + UnsignedBits exchange_keys[TILE_ITEMS]; + OffsetT relative_bin_offsets[RADIX_DIGITS]; + }; + + Uninitialized exchange_values; + + OffsetT exclusive_digit_prefix[RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + ValueT *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // 
Number of bits in current digit + int num_bits; + + // Whether to short-cirucit + int short_circuit; + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + + /** + * Scatter ranked keys through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = BFE(key, current_bit, num_bits); + relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + + // Un-twiddle + key = Traits::TwiddleOut(key); + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; + } + } + } + + + /** + * Scatter ranked values through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + CTA_SYNC(); + + ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + exchange_values[ranks[ITEM]] = values[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; + } + } + } + + /** + * Load a tile of keys (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys, valid_items, oob_item); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + 
Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); + } + + + /** + * Load a tile of keys (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); + } + + + /** + * Load a tile of values (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values); + + CTA_SYNC(); + } + + + /** + * Load a tile of values (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values, valid_items); + + CTA_SYNC(); + } + + + /** + * Load a tile of items (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); + } + + + /** + * Load a tile of items (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); + } + + + /** + * Truck along associated values + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type /*is_keys_only*/) + { + ValueT values[ITEMS_PER_THREAD]; + + CTA_SYNC(); + + LoadValues( + values, + block_offset, + valid_items, + Int2Type(), + Int2Type()); + + ScatterValues( + values, + relative_bin_offsets, + ranks, + valid_items); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + OffsetT /*block_offset*/, + OffsetT /*valid_items*/, + Int2Type /*is_keys_only*/) + {} + + + 
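// A minimal, self-contained analogue (not the CUB type itself) of the Int2Type
// tag-dispatch used by GatherScatterValues above, and by the full/partial-tile
// and RLE specializations elsewhere in these agents: a compile-time integer is
// lifted into a distinct type so overload resolution selects the right
// specialization with no run-time branch. Names here (Int2TypeSketch,
// gather_scatter) are illustrative only.
#include <cstdio>

template <int VALUE>
struct Int2TypeSketch { enum { V = VALUE }; };        // stand-in for cub::Int2Type

// Key-value path: values must be gathered and scattered.
void gather_scatter(Int2TypeSketch<false> /*is_keys_only*/)
{
    std::printf("key-value path: load, rank-permute, and scatter the values\n");
}

// Keys-only path: the overload is empty and compiles away entirely.
void gather_scatter(Int2TypeSketch<true> /*is_keys_only*/) {}

int main()
{
    // In the agent this flag is the KEYS_ONLY constant (Equals<ValueT, NullType>).
    const bool KEYS_ONLY = false;
    gather_scatter(Int2TypeSketch<KEYS_ONLY>());      // resolved at compile time
    gather_scatter(Int2TypeSketch<true>());           // keys-only: no work emitted
    return 0;
}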
/** + * Process tile + */ + template + __device__ __forceinline__ void ProcessTile( + OffsetT block_offset, + const OffsetT &valid_items = TILE_ITEMS) + { + UnsignedBits keys[ITEMS_PER_THREAD]; + int ranks[ITEMS_PER_THREAD]; + OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; + + // Assign default (min/max) value to all keys + UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY; + + // Load tile of keys + LoadKeys( + keys, + block_offset, + valid_items, + default_key, + Int2Type(), + Int2Type()); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + BlockRadixRankT(temp_storage.radix_rank).RankKeys( + keys, + ranks, + current_bit, + num_bits, + exclusive_digit_prefix); + + CTA_SYNC(); + + // Share exclusive digit prefix + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Store exclusive prefix + temp_storage.exclusive_digit_prefix[bin_idx] = + exclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Get inclusive digit prefix + int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + // Get inclusive digit prefix from exclusive prefix (higher bins come first) + inclusive_digit_prefix[track] = (bin_idx == 0) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx - 1]; + } + else + { + // Get inclusive digit prefix from exclusive prefix (lower bins come first) + inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? 
+ (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx + 1]; + } + } + } + + CTA_SYNC(); + + // Update global scatter base offsets for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_offset[track] -= exclusive_digit_prefix[track]; + temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + bin_offset[track] += inclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Scatter keys + ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); + + // Gather/scatter values + GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); + } + + //--------------------------------------------------------------------- + // Copy shortcut + //--------------------------------------------------------------------- + + /** + * Copy tiles within the range of input + */ + template < + typename InputIteratorT, + typename T> + __device__ __forceinline__ void Copy( + InputIteratorT d_in, + T *d_out, + OffsetT block_offset, + OffsetT block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + OffsetT valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template + __device__ __forceinline__ void Copy( + InputIteratorT /*d_in*/, + NullType * /*d_out*/, + OffsetT /*block_offset*/, + OffsetT /*block_end*/) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], + OffsetT num_items, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + this->bin_offset[track] = bin_offset[track]; + + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Short circuit if the histogram has only bin counts of only zeros or problem-size + short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT num_items, + OffsetT *d_spine, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT 
*d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + + // Load my block's bin offset for my bin + bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Distribute keys from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + OffsetT block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + #pragma unroll 1 + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile(block_offset); + block_offset += TILE_ITEMS; + + CTA_SYNC(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile(block_offset, block_end - block_offset); + } + + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_upsweep.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_upsweep.cuh new file mode 100644 index 0000000000..2081cefba9 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_upsweep.cuh @@ -0,0 +1,526 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . + */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../warp/warp_reduce.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
+ */ +template < + typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type + typename KeyT, ///< KeyT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. + + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; + OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + AgentRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + 
Iterate::BucketKeys(cta, keys); + } + }; + + // Terminate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting + CTA_SYNC(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + OffsetT block_offset, + const OffsetT &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortUpsweep( + TempStorage &temp_storage, + const KeyT *d_keys_in, 
+ int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + const OffsetT &block_end) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + CTA_SYNC(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + CTA_SYNC(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + CTA_SYNC(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + } + + + /** + * Extract counts (saving them to the external array) + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT *counters, + int bin_stride = 1, + int bin_offset = 0) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + + // Whole blocks + #pragma unroll + for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; + (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; + BIN_BASE += BLOCK_THREADS) + { + int bin_idx = BIN_BASE + threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + + // Remainder + if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) + { + int bin_idx = threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + } + + + /** + * Extract counts + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_count[track] = 0; + + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count[track] += temp_storage.block_counters[i][bin_idx]; + } + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_reduce.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_reduce.cuh new file mode 100644 index 0000000000..000a905ccf --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_reduce.cuh @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . 
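Editor's note: the strategy the class below spells out is a two-level reduction. Each thread privately folds the block-striped items it loads from successive tiles into a running thread_aggregate, and a single block-wide reduction at the end combines the per-thread partials. The following host-side sketch models that strategy sequentially; it is an editor's illustration, not CUB code, the toy sizes are arbitrary, and the zero-initialized aggregates are a simplification (the real agent seeds each aggregate with the first item it reads).

#include <cstdio>
#include <vector>

int main()
{
    const int BLOCK_THREADS    = 4;   // threads per block (toy size)
    const int ITEMS_PER_THREAD = 2;   // items per thread per tile
    const int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

    std::vector<int> d_in(22);
    for (int i = 0; i < (int) d_in.size(); ++i) d_in[i] = i;

    // Per-thread partial reductions (thread_aggregate in the agent)
    std::vector<long long> thread_aggregate(BLOCK_THREADS, 0);

    // Full tiles: block-striped loads, reduced privately by each thread
    int block_offset = 0;
    while (block_offset + TILE_ITEMS <= (int) d_in.size())
    {
        for (int t = 0; t < BLOCK_THREADS; ++t)
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
                thread_aggregate[t] += d_in[block_offset + (i * BLOCK_THREADS) + t];
        block_offset += TILE_ITEMS;
    }

    // Partial tile: guarded, block-striped single loads
    for (int idx = block_offset; idx < (int) d_in.size(); ++idx)
        thread_aggregate[(idx - block_offset) % BLOCK_THREADS] += d_in[idx];

    // Final block-wide reduction of the partials (BlockReduce's role)
    long long total = 0;
    for (int t = 0; t < BLOCK_THREADS; ++t)
        total += thread_aggregate[t];

    std::printf("sum = %lld (expected %d)\n", total, (21 * 22) / 2);
    return 0;
}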
+ */ + +#pragma once + +#include + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements +struct AgentReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ +template < + typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type + typename InputIteratorT, ///< Random-access iterator type for input + typename OutputIteratorT, ///< Random-access iterator type for output + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +struct AgentReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type InputT; + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + /// Vector type of InputT for data movement + typedef typename CubVector::Type VectorT; + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + /// Constants + enum + { + BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (IsPointer::VALUE) && Traits::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + typedef BlockReduce BlockReduceT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + + //--------------------------------------------------------------------- + // Utility + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator /*d_in*/, + Int2Type /*can_vectorize*/) + { + return false; + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_wrapped_in(d_in), + reduction_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * Consume a full tile of input (non-vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type 
/*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + OutputT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a full tile of input (vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Alias items as an array of VectorT and load it in striped fashion + enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; + + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); + #pragma unroll + for (int i = 0; i < WORDS; ++i) + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; + + // Convert from input type to output type + OutputT items[ITEMS_PER_THREAD]; + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + items[i] = input_items[i]; + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a partial tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int valid_items, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) + { + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; + } + + // Continue reading items (block-striped) + while (thread_offset < valid_items) + { + OutputT item = d_wrapped_in[block_offset + thread_offset]; + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + template + __device__ __forceinline__ OutputT ConsumeRange( + GridEvenShare &even_share, ///< GridEvenShare descriptor + Int2Type can_vectorize) ///< Whether or not we can vectorize loads + { + OutputT thread_aggregate; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) + { + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + return 
BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); + } + + // At least one full block + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) + { + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + } + + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) + { + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + } + + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + } + + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeRange( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeTiles( + GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_reduce_by_key.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_reduce_by_key.cuh new file mode 100644 index 0000000000..51964d3e68 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_reduce_by_key.cuh @@ -0,0 +1,547 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentReduceByKey +{ + //--------------------------------------------------------------------- 
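Editor's note: before the implementation details, it may help to state the contract this agent parallelizes. Reduce-value-by-key treats each maximal run of consecutive equal keys as a segment: it writes the key once to d_unique_out, the reduction of that segment's values to d_aggregates_out, and the number of segments to d_num_runs_out. A sequential reference follows as an editor's sketch; the function name is illustrative only.

#include <cstdio>
#include <functional>
#include <vector>

template <typename Key, typename Value, typename ReductionOp>
int reduce_by_key_reference(const std::vector<Key>   &keys_in,
                            const std::vector<Value> &values_in,
                            std::vector<Key>         &unique_out,
                            std::vector<Value>       &aggregates_out,
                            ReductionOp               reduction_op)
{
    for (size_t i = 0; i < keys_in.size(); ++i)
    {
        if (i == 0 || !(keys_in[i] == keys_in[i - 1]))
        {
            // Segment head: start a new run
            unique_out.push_back(keys_in[i]);
            aggregates_out.push_back(values_in[i]);
        }
        else
        {
            // Same segment: fold the value into the running aggregate
            aggregates_out.back() = reduction_op(aggregates_out.back(), values_in[i]);
        }
    }
    return (int) unique_out.size();   // what d_num_runs_out would receive
}

int main()
{
    std::vector<int> keys   = {0, 0, 1, 1, 1, 3, 3, 2};
    std::vector<int> values = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> unique, aggregates;

    int num_runs = reduce_by_key_reference(keys, values, unique, aggregates, std::plus<int>());
    for (int r = 0; r < num_runs; ++r)
        std::printf("key %d -> %d\n", unique[r], aggregates[r]);
    // Prints: key 0 -> 3, key 1 -> 12, key 3 -> 13, key 2 -> 8
    return 0;
}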
+ // Types and constants + //--------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair OffsetValuePairT; + + // Tuple type for pairing keys and values + typedef KeyValuePair KeyValuePairT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Guarded inequality functor + template + struct GuardedInequalityWrapper + { + _EqualityOpT op; ///< Wrapped equality operator + int num_remaining; ///< Items remaining + + /// Constructor + __host__ __device__ __forceinline__ + GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const + { + if (idx < num_remaining) + return !op(a, b); // In bounds + + // Return true if first out-of-bounds item, false otherwise + return (idx == num_remaining); + } + }; + + + // Constants + enum + { + BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeysInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedKeysInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValuesInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedValuesInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for keys + typedef BlockLoad< + 
KeyOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadKeysT; + + // Parameterized BlockLoad type for values + typedef BlockLoad< + ValueOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadValuesT; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity< + KeyOutputT, + BLOCK_THREADS> + BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetValuePairT, + BLOCK_THREADS, + AgentReduceByKeyPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Key and value exchange types + typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; + typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading keys + typename BlockLoadKeysT::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValuesT::TempStorage load_values; + + // Smem needed for compacting key value pairs(allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedKeysInputIteratorT d_keys_in; ///< Input keys + UniqueOutputIteratorT d_unique_out; ///< Unique output keys + WrappedValuesInputIteratorT d_values_in; ///< Input values + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified + EqualityOpT equality_op; ///< KeyT equality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentReduceByKey( + TempStorage& temp_storage, ///< Reference to temp_storage + KeysInputIteratorT d_keys_in, ///< Input keys + UniqueOutputIteratorT d_unique_out, ///< Unique output keys + ValuesInputIteratorT d_values_in, ///< Input values + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_unique_out(d_unique_out), + d_values_in(d_values_in), + d_aggregates_out(d_aggregates_out), + d_num_runs_out(d_num_runs_out), + equality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + 
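Editor's note: the typedefs and fields above set up the core trick used in ConsumeTile below. Each value is zipped with its segment head flag into an (offset, value) pair, and the pairs are scanned with a reduce-by-segment operator that adds the flags (thereby counting segments) while restarting the value accumulation at every head. After an exclusive scan, a flagged position therefore carries the aggregate and the output index of the segment that just ended, which is exactly what gets scattered. Below is a small sequential model of that operator, specialized for addition; it is an editor's sketch and the names are illustrative.

#include <cstdio>

struct Pair { int key; int value; };                // (segment count, running aggregate)

// Models the reduce-by-segment scan operator with a plus reduction
static Pair reduce_by_segment(Pair a, Pair b)
{
    Pair r;
    r.key   = a.key + b.key;                        // count segment heads
    r.value = b.key ? b.value : a.value + b.value;  // restart the sum at a head
    return r;
}

int main()
{
    // Head flags and values for two segments: {1,2,3} and {4,5}.
    // As in ConsumeTile, the very first item is not flagged as a head.
    int flags[]  = {0, 0, 0, 1, 0};
    int values[] = {1, 2, 3, 4, 5};
    const int n  = 5;

    Pair running = {0, 0};                          // scan identity
    for (int i = 0; i < n; ++i)
    {
        Pair exclusive = running;                   // exclusive-scan result at i
        running = reduce_by_segment(running, Pair{flags[i], values[i]});

        // At a head, the exclusive result holds the aggregate and the
        // output index of the segment that just ended.
        if (flags[i])
            std::printf("segment %d aggregate = %d\n", exclusive.key, exclusive.value);
    }
    // The trailing segment is emitted by the last-tile logic in the agent:
    std::printf("segment %d aggregate = %d\n", running.key, running.value);
    return 0;
}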
//--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Directly scatter flagged items to output offsets + */ + __device__ __forceinline__ void ScatterDirect( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD]) + { + // Scatter flagged keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; + d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; + } + } + } + + + /** + * 2-phase scatter flagged items to output offsets + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate: the scatter offsets must be decremented for value aggregates + */ + __device__ __forceinline__ void ScatterTwoPhase( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + CTA_SYNC(); + + // Compact and scatter pairs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) + { + KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; + d_unique_out[num_tile_segments_prefix + item] = pair.key; + d_aggregates_out[num_tile_segments_prefix + item] = pair.value; + } + } + + + /** + * Scatter flagged items + */ + __device__ __forceinline__ void Scatter( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) + { + ScatterTwoPhase( + scatter_items, + segment_flags, + segment_indices, + num_tile_segments, + num_tile_segments_prefix); + } + else + { + ScatterDirect( + scatter_items, + segment_flags, + segment_indices); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys + KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up + ValueOutputT values[ITEMS_PER_THREAD]; // Tile values + OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags + OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices + OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + KeyValuePairT 
scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering + + // Load keys + if (IS_LAST_TILE) + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); + else + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + + // Load tile predecessor key in first thread + KeyOutputT tile_predecessor; + if (threadIdx.x == 0) + { + tile_predecessor = (tile_idx == 0) ? + keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) + d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile + } + + CTA_SYNC(); + + // Load values + if (IS_LAST_TILE) + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); + else + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); + + CTA_SYNC(); + + // Initialize head-flags and shuffle up the previous keys + if (IS_LAST_TILE) + { + // Use custom flag operator to additionally flag the first out-of-bounds item + GuardedInequalityWrapper flag_op(equality_op, num_remaining); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + else + { + InequalityWrapper flag_op(equality_op); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + + // Zip values and head flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_items[ITEM].value = values[ITEM]; + scan_items[ITEM].key = head_flags[ITEM]; + } + + // Perform exclusive tile scan + OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate + OffsetT num_segments_prefix; // Number of segments prior to this tile + OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate + if (tile_idx == 0) + { + // Scan first tile + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); + num_segments_prefix = 0; + total_aggregate = block_aggregate; + + // Update tile status if there are successor tiles + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); + + block_aggregate = prefix_op.GetBlockAggregate(); + num_segments_prefix = prefix_op.GetExclusivePrefix().key; + total_aggregate = prefix_op.GetInclusivePrefix(); + } + + // Rezip scatter items and segment indices + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scatter_items[ITEM].key = prev_keys[ITEM]; + scatter_items[ITEM].value = scan_items[ITEM].value; + segment_indices[ITEM] = scan_items[ITEM].key; + } + + // At this point, each flagged segment head has: + // - The key for the previous segment + // - The reduced value from the previous segment + // - The segment index for the reduced value + + // Scatter flagged keys and values + OffsetT num_tile_segments = block_aggregate.key; + Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); + + // Last thread in last tile will output final count (and last pair, if necessary) + if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) + { + OffsetT num_segments = num_segments_prefix + num_tile_segments; + + // If the last tile is a whole tile, output the final_value 
+ if (num_remaining == TILE_ITEMS) + { + d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; + d_aggregates_out[num_segments] = total_aggregate.value; + num_segments++; + } + + // Output the total number of items selected + *d_num_runs_out = num_segments; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_rle.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_rle.cuh new file mode 100644 index 0000000000..cb7a4a652d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_rle.cuh @@ -0,0 +1,837 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
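Editor's note, for orientation before the flag manipulation that follows: the head and tail flags computed in InitializeSelections mark run boundaries, and an item that is both a head and a tail is a trivial (length-1) run that produces no output, so the agent ultimately records the starting offset and length of every run of two or more consecutive equal items. A sequential reference for that output is sketched below; it is an editor's illustration and the helper name is made up.

#include <cstdio>
#include <vector>

// Emit (offset, length) for every non-trivial run (length >= 2) of equal,
// consecutive items; returns the number of such runs.
template <typename T>
int non_trivial_runs(const std::vector<T> &in,
                     std::vector<int> &offsets_out,
                     std::vector<int> &lengths_out)
{
    size_t i = 0;
    while (i < in.size())
    {
        size_t j = i + 1;
        while (j < in.size() && in[j] == in[i]) ++j;   // extend the run
        if (j - i >= 2)                                // skip trivial (length-1) runs
        {
            offsets_out.push_back((int) i);
            lengths_out.push_back((int) (j - i));
        }
        i = j;
    }
    return (int) offsets_out.size();
}

int main()
{
    std::vector<int> in = {7, 7, 7, 2, 5, 5, 9, 9, 9, 9, 3};
    std::vector<int> offsets, lengths;
    int num_runs = non_trivial_runs(in, offsets, lengths);
    for (int r = 0; r < num_runs; ++r)
        std::printf("run at %d, length %d\n", offsets[r], lengths[r]);
    // Prints: run at 0, length 3 / run at 4, length 2 / run at 6, length 4
    return 0;
}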
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRle + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentRlePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for data + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values + typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRle +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type T; + + /// The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + /// Tuple type for scanning (pairs run-length and run-index) + typedef KeyValuePair LengthOffsetPair; + + /// Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + }; + + + /** + * Special operator that signals all out-of-bounds items are not equal to everything else, + * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked + * trivial. + */ + template + struct OobInequalityOp + { + OffsetT num_remaining; + EqualityOpT equality_op; + + __device__ __forceinline__ OobInequalityOp( + OffsetT num_remaining, + EqualityOpT equality_op) + : + num_remaining(num_remaining), + equality_op(equality_op) + {} + + template + __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) + { + if (!LAST_TILE || (idx < num_remaining)) + return !equality_op(first, second); + else + return true; + } + }; + + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for data + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Parameterized BlockLoad type for data + typedef BlockLoad< + T, + AgentRlePolicyT::BLOCK_THREADS, + AgentRlePolicyT::ITEMS_PER_THREAD, + AgentRlePolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockDiscontinuity type for data + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Parameterized WarpScan type + typedef WarpScan WarpScanPairs; + + // Reduce-length-by-run scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + LengthOffsetPair, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Warp exchange types + typedef WarpExchange WarpExchangePairs; + + typedef typename If::Type WarpExchangePairsStorage; + + typedef WarpExchange WarpExchangeOffsets; + typedef WarpExchange WarpExchangeLengths; + + typedef LengthOffsetPair WarpAggregates[WARPS]; + + // Shared memory type for this thread block + struct _TempStorage + { + // Aliasable storage layout + union Aliasable + { + struct + { + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans + Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage 
load; + + // Aliasable layout needed for two-phase scatter + union ScatterAliasable + { + unsigned long long align; + WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; + + } scatter_aliasable; + + } aliasable; + + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentRle( + TempStorage &temp_storage, ///< [in] Reference to temp_storage + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths + EqualityOpT equality_op, ///< [in] T equality operator + OffsetT num_items) ///< [in] Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_offsets_out(d_offsets_out), + d_lengths_out(d_lengths_out), + equality_op(equality_op), + scan_op(cub::Sum()), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_remaining, + T (&items)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + bool head_flags[ITEMS_PER_THREAD]; + bool tail_flags[ITEMS_PER_THREAD]; + + OobInequalityOp inequality_op(num_remaining, equality_op); + + if (FIRST_TILE && LAST_TILE) + { + // First-and-last-tile always head-flags the first item and tail-flags the last item + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, items, inequality_op); + } + else if (FIRST_TILE) + { + // First-tile always head-flags the first item + + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, tile_successor_item, items, inequality_op); + } + else if (LAST_TILE) + { + // Last-tile always flags the last item + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + 
tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, items, inequality_op); + } + else + { + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); + } + + // Zip counts and runs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); + lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); + } + } + + //--------------------------------------------------------------------- + // Scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan of allocations + */ + __device__ __forceinline__ void WarpScanAllocations( + LengthOffsetPair &tile_aggregate, + LengthOffsetPair &warp_aggregate, + LengthOffsetPair &warp_exclusive_in_tile, + LengthOffsetPair &thread_exclusive_in_warp, + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + // Perform warpscans + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + LengthOffsetPair identity; + identity.key = 0; + identity.value = 0; + + LengthOffsetPair thread_inclusive; + LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); + WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( + thread_aggregate, + thread_inclusive, + thread_exclusive_in_warp, + identity, + scan_op); + + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; + + CTA_SYNC(); + + // Accumulate total selected and the warp-wide prefix + warp_exclusive_in_tile = identity; + warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; + tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; + + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_exclusive_in_tile = tile_aggregate; + + tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Two-phase scatter, specialized for warp time-slicing + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Locally compact items within the warp (first warp) + if (warp_id == 0) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + + // Locally compact items within the warp (remaining warps) + #pragma unroll + for (int SLICE = 1; SLICE < WARPS; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + } + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Two-phase scatter + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Unzip + OffsetT run_offsets[ITEMS_PER_THREAD]; + LengthT run_lengths[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + run_offsets[ITEM] = lengths_and_offsets[ITEM].key; + run_lengths[ITEM] = lengths_and_offsets[ITEM].value; + } + + WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( + run_offsets, thread_num_runs_exclusive_in_warp); + + WARP_SYNC(0xffffffff); + + WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( + run_lengths, thread_num_runs_exclusive_in_warp); + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = run_offsets[ITEM]; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = run_lengths[ITEM]; + } + } + } + } + + + /** + * Direct scatter + */ + template + __device__ __forceinline__ void ScatterDirect( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + thread_num_runs_exclusive_in_warp[ITEM]; + + // Scatter 
offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if (item_offset >= 1) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Scatter + */ + template + __device__ __forceinline__ void Scatter( + OffsetT tile_num_runs_aggregate, + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) + { + // Direct scatter if the warp has any items + if (warp_num_runs_aggregate) + { + ScatterDirect( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + } + } + else + { + // Scatter two phase + ScatterTwoPhase( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets, + Int2Type()); + } + } + + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ LengthOffsetPair ConsumeTile( + OffsetT num_items, ///< Total number of global input items + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT &tile_status) ///< Global list of tile status + { + if (tile_idx == 0) + { + // First tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_aggregate); + + // Update thread_exclusive_in_warp to fold in warp run-length + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; + + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + + // Downsweep scan through lengths_and_num_runs + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset 
+ (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? + lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = 0; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return tile_aggregate; + } + else + { + // Not first tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // First warp computes tile prefix in lane 0 + TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + if (warp_id == 0) + { + prefix_op(tile_aggregate); + if (threadIdx.x == 0) + temp_storage.tile_exclusive = prefix_op.exclusive_prefix; + } + + CTA_SYNC(); + + LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; + + // Update thread_exclusive_in_warp to fold in warp and tile run-lengths + LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += thread_exclusive.value; + + // Downsweep scan through lengths_and_num_runs + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
+ lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return prefix_op.inclusive_prefix; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_status, ///< Global list of tile status + NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + + if (threadIdx.x == 0) + { + // Output the total number of items selected + *d_num_runs_out = running_total.key; + + // The inclusive prefix contains accumulated length reduction for the last run + if (running_total.key > 0) + d_lengths_out[running_total.key - 1] = running_total.value; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_scan.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_scan.cuh new file mode 100644 index 0000000000..9368615ef4 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_scan.cuh @@ -0,0 +1,471 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . 
+ */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type + typename OutputIteratorT, ///< Random-access output iterator type + typename ScanOpT, ///< Scan functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OutputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + OutputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input data + OutputIteratorT d_out; ///< Output data + ScanOpT scan_op; ///< Binary scan operator + InitValueT init_value; ///< The init_value 
element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScan( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanOpT scan_op, ///< Binary scan operator + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + init_value(init_value) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // 
Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + + + //--------------------------------------------------------------------- + // Scan an sequence of consecutive tiles (independent of other thread blocks) + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template < + bool IS_FIRST_TILE, + bool IS_LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + OffsetT tile_offset, ///< Tile offset + RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Block scan + if (IS_FIRST_TILE) + { + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + prefix_op.running_total = block_aggregate; + } + else + { + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan a consecutive share of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT range_end) ///< [in] Threadblock end offset (exclusive) + { + BlockScanRunningPrefixOp prefix_op(scan_op); + + if (range_offset + TILE_ITEMS <= range_end) + { + // Consume first tile of input (full) + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + + // Consume subsequent full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + else + { + // Consume the first tile of input (partially-full) + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + + + /** + * Scan a consecutive share of input tiles, seeded with the specified prefix value + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT 
range_end, ///< [in] Threadblock end offset (exclusive) + OutputT prefix) ///< [in] The prefix to apply to the scan segment + { + BlockScanRunningPrefixOp prefix_op(prefix, scan_op); + + // Consume full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_segment_fixup.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_segment_fixup.cuh new file mode 100644 index 0000000000..e2de58ed66 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_segment_fixup.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. 
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSegmentFixup + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSegmentFixupPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentSegmentFixup +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key-value input iterator + typedef typename std::iterator_traits::value_type KeyValuePairT; + + // Value type + typedef typename KeyValuePairT::Value ValueT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not do fixup using RLE + global atomics + USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && + (Equals::VALUE || + Equals::VALUE || + Equals::VALUE || + Equals::VALUE), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying 
cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + PairsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedPairsInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for pairs + typedef BlockLoad< + KeyValuePairT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSegmentFixupPolicyT::LOAD_ALGORITHM> + BlockLoadPairs; + + // Parameterized BlockScan type + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSegmentFixupPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + KeyValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for loading keys + typename BlockLoadPairs::TempStorage load_pairs; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSegmentFixup( + TempStorage& temp_storage, ///< Reference to temp_storage + PairsInputIteratorT d_pairs_in, ///< Input keys + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_pairs_in(d_pairs_in), + d_aggregates_out(d_aggregates_out), + d_fixup_in(d_aggregates_out), + inequality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process input tile. 
Specialized for atomic-fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + // RLE + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; + if (pairs[ITEM].key != pairs[ITEM - 1].key) + atomicAdd(d_scatter, pairs[ITEM - 1].value); + else + pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); + } + + // Flush last item if valid + ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; + if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) + atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); + } + + + /** + * Process input tile. Specialized for reduce-by-key fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + CTA_SYNC(); + + KeyValuePairT tile_aggregate; + if (tile_idx == 0) + { + // Exclusive scan of values and segment_flags + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); + + // Update tile status if this is not the last tile + if (threadIdx.x == 0) + { + // Set first segment id to not trigger a flush (invalid from exclusive scan) + scatter_pairs[0].key = pairs[0].key; + + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, tile_aggregate); + + } + } + else + { + // Exclusive scan of values and segment_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); + tile_aggregate = prefix_op.GetBlockAggregate(); + } + + // Scatter updated values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scatter_pairs[ITEM].key != pairs[ITEM].key) + { + // Update the value at the key location + ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; + value = reduction_op(value, scatter_pairs[ITEM].value); + + d_aggregates_out[scatter_pairs[ITEM].key] = value; + } + } + + // Finalize the last item + if (IS_LAST_TILE) + { + // Last thread will output final count and last item, if necessary + if (threadIdx.x == BLOCK_THREADS - 1) + { + // If the last tile is a whole tile, the inclusive prefix contains accumulated value 
reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + // Update the value at the key location + OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; + d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); + } + } + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not the last tile (full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_select_if.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_select_if.cuh new file mode 100644 index 0000000000..52ca9fc284 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_select_if.cuh @@ -0,0 +1,703 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSelectIf + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSelectIfPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for selection items + typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct AgentSelectIf +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? 
+ typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + SELECT_METHOD = (!Equals::VALUE) ? + USE_SELECT_OP : + (!Equals::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for items + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + FlagsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFlagsInputIteratorT; + + // Parameterized BlockLoad type for input data + typedef BlockLoad< + OutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + FlagT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockDiscontinuity type for items + typedef BlockDiscontinuity< + OutputT, + BLOCK_THREADS> + BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetT, + BLOCK_THREADS, + AgentSelectIfPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetT, + cub::Sum, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Item exchange type + typedef OutputT ItemExchangeT[TILE_ITEMS]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading items + typename BlockLoadT::TempStorage load_items; + + // Smem needed for loading values + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + 
SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSelectIf( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< Output data + SelectOpT select_op, ///< Selection operator + EqualityOpT equality_op, ///< Equality operator + OffsetT num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags_in(d_flags_in), + d_selected_out(d_selected_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Initialize selections (specialized for selection operator) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT /*tile_offset*/, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 1; + + if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) + selection_flags[ITEM] = select_op(items[ITEM]); + } + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&/*items*/)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + CTA_SYNC(); + + FlagT flags[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Out-of-bounds items are selection_flags + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); + } + else + { + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + } + + // Convert flag type to selection_flags type + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selection_flags[ITEM] = flags[ITEM]; + } + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + if (IS_FIRST_TILE) + { + CTA_SYNC(); + + // Set head selection_flags. 
First tile sets the first flag for the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); + } + else + { + OutputT tile_predecessor; + if (threadIdx.x == 0) + tile_predecessor = d_in[tile_offset - 1]; + + CTA_SYNC(); + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); + } + + // Set selection flags for out-of-bounds items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + */ + template + __device__ __forceinline__ void ScatterDirect( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + OffsetT num_selections) + { + // Scatter flagged items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selection_flags[ITEM]) + { + if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) + { + d_selected_out[selection_indices[ITEM]] = items[ITEM]; + } + } + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) + { + d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + int tile_num_rejections = num_tile_items - num_tile_selections; + + // Scatter items to shared memory 
(rejections first) + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; + int local_rejection_idx = item_idx - local_selection_idx; + int local_scatter_offset = (selection_flags[ITEM]) ? + tile_num_rejections + local_selection_idx : + local_rejection_idx; + + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + + CTA_SYNC(); + + // Gather items from shared memory and scatter to global + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; + OffsetT scatter_offset = (item_idx < tile_num_rejections) ? + num_items - num_rejected_prefix - rejection_idx - 1 : + num_selections_prefix + selection_idx; + + OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; + + if (!IS_LAST_TILE || (item_idx < num_tile_items)) + { + d_selected_out[scatter_offset] = item; + } + } + } + + + /** + * Scatter flagged items + */ + template + __device__ __forceinline__ void Scatter( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + OffsetT num_selections) ///< Total number of selections including this tile + { + // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one + if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) + { + ScatterTwoPhase( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + Int2Type()); + } + else + { + ScatterDirect( + items, + selection_flags, + selection_indices, + num_selections); + } + } + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of selection_flags + OffsetT num_tile_selections; + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + num_tile_selections -= (TILE_ITEMS - num_tile_items); + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + 0, + 0, + num_tile_selections); + + return num_tile_selections; + } + + + /** + * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of values and selection_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); + + OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); + OffsetT num_selections = prefix_op.GetInclusivePrefix(); + OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); + OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = TILE_ITEMS - num_tile_items; + num_selections -= num_discount; + num_tile_selections -= num_discount; + } + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) 
///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); + } + else + { + // The last tile (possibly partially-full) + OffsetT num_remaining = num_items - tile_offset; + OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + + if (threadIdx.x == 0) + { + // Output the total number of items selection_flags + *d_num_selected_out = num_selections; + } + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_spmv_orig.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_spmv_orig.cuh new file mode 100644 index 0000000000..54e2a13946 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_spmv_orig.cuh @@ -0,0 +1,670 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_reduce.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../thread/thread_search.cuh" +#include "../thread/thread_operators.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/counting_input_iterator.cuh" +#include "../iterator/tex_ref_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSpmv + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search + CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets + CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices + CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values + CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values + bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSpmvPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) + }; + + static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices + static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values + static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +struct SpmvParams +{ + ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
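The SpmvParams bundle describes A in CSR form through row-end offsets (the end of row r in d_column_indices/d_values is d_row_end_offsets[r]), and the agent ultimately computes y = alpha*A*x + beta*y. For orientation, here is a plain host-side reference under that convention; SpmvReference and its parameter names are invented for this sketch, and it deliberately ignores the HAS_ALPHA/HAS_BETA compile-time specializations used by the agent.

#include <cstddef>
#include <vector>

// y = alpha * A * x + beta * y, with A in CSR form given via row-end offsets:
// row r owns nonzeros k in [ r == 0 ? 0 : row_end[r-1], row_end[r] ).
template <typename ValueT, typename OffsetT>
void SpmvReference(
    const std::vector<OffsetT>& row_end,   // num_rows row-end offsets
    const std::vector<OffsetT>& col_idx,   // num_nonzeros column indices
    const std::vector<ValueT>&  values,    // num_nonzeros values
    const std::vector<ValueT>&  x,         // dense input vector (num_cols)
    std::vector<ValueT>&        y,         // dense output vector (num_rows), updated in place
    ValueT                      alpha,
    ValueT                      beta)
{
    OffsetT row_start = 0;
    for (std::size_t r = 0; r < row_end.size(); ++r)
    {
        ValueT dot = 0;
        for (OffsetT k = row_start; k < row_end[r]; ++k)
            dot += values[k] * x[col_idx[k]];

        y[r] = alpha * dot + beta * y[r];
        row_start = row_end[r];
    }
}

Storing row-end offsets (rather than row-start offsets) fits the merge-path approach used further down: the agent treats the row-end offsets and the natural numbering of the nonzeros as two sorted lists and partitions work along their merge path.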
+ OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows; ///< Number of rows of matrix A. + int num_cols; ///< Number of columns of matrix A. + int num_nonzeros; ///< Number of nonzero elements of matrix A. + ValueT alpha; ///< Alpha multiplicand + ValueT beta; ///< Beta addend-multiplicand + + TexRefInputIterator t_vector_x; +}; + + +/** + * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 + bool HAS_BETA, ///< Whether the input parameter \p beta is 0 + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentSpmv +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + /// 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + /// Input iterator wrapper types (for applying cache modifiers) + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, + OffsetT, + OffsetT> + ColumnIndicesIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + ValueIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // BlockReduce specialization + typedef BlockReduce< + ValueT, + BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // BlockScan specialization + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // BlockScan specialization + typedef BlockScan< + ValueT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockPrefixSumT; + + // BlockExchange specialization + typedef BlockExchange< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + /// Merge item type (either a non-zero value or a row-end offset) + union MergeItem + { + // Value type to pair with index type OffsetT (NullType if loading values directly during merge) + typedef typename If::Type MergeValueT; + + OffsetT 
row_end_offset; + MergeValueT nonzero; + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CoordinateT tile_coords[2]; + + union Aliasable + { + // Smem needed for tile of merge items + MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; + + // Smem needed for block exchange + typename BlockExchangeT::TempStorage exchange; + + // Smem needed for block-wide reduction + typename BlockReduceT::TempStorage reduce; + + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for tile prefix sum + typename BlockPrefixSumT::TempStorage prefix_sum; + + } aliasable; + }; + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + + _TempStorage& temp_storage; /// Reference to temp_storage + + SpmvParams& spmv_params; + + ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentSpmv( + TempStorage& temp_storage, ///< Reference to temp_storage + SpmvParams& spmv_params) ///< SpMV input parameter bundle + : + temp_storage(temp_storage.Alias()), + spmv_params(spmv_params), + wd_values(spmv_params.d_values), + wd_row_end_offsets(spmv_params.d_row_end_offsets), + wd_column_indices(spmv_params.d_column_indices), + wd_vector_x(spmv_params.d_vector_x), + wd_vector_y(spmv_params.d_vector_y) + {} + + + + + /** + * Consume a merge tile, specialized for direct-load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + + // Gather the row end-offsets for the merge tile into shared memory + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + 
tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + + ValueT running_total = 0.0; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + running_total += nonzero; + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = tile_num_rows; + ++thread_current_coord.y; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = thread_current_coord.x; + running_total = 0.0; + ++thread_current_coord.x; + } + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (tile_num_rows > 0) + { + if (threadIdx.x == 0) + scan_item.key = -1; + + // Direct scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM].key < tile_num_rows) + { + if (scan_item.key == scan_segment[ITEM].key) + scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; + + if (HAS_ALPHA) + { + scan_segment[ITEM].value *= spmv_params.alpha; + } + + if (HAS_BETA) + { + // Update the output vector element + ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; + scan_segment[ITEM].value += addend; + } + + // Set the output vector element + spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; + } + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + +#if (CUB_PTX_ARCH >= 520) + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + + ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; + ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; + ValueT* s = s_tile_nonzeros + nonzero_idx; + + if 
(nonzero_idx < tile_num_nonzeros) + { + + OffsetT column_idx = *ci; + ValueT value = *a; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; + vector_value = wd_vector_x[column_idx]; + + ValueT nonzero = value * vector_value; + + *s = nonzero; + } + } + + +#else + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + if (tile_num_nonzeros > 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); + + OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; + ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + s_tile_nonzeros[nonzero_idx] = nonzero; + } + } + +#endif + + // Gather the row end-offsets for the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + ValueT running_total = 0.0; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + scan_segment[ITEM].value = nonzero; + running_total += nonzero; + ++thread_current_coord.y; + nonzero = s_tile_nonzeros[thread_current_coord.y]; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = 0.0; + running_total = 0.0; + ++thread_current_coord.x; + row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + } + + scan_segment[ITEM].key = thread_current_coord.x; + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (threadIdx.x == 0) + { + scan_item.key = thread_start_coord.x; + scan_item.value = 0.0; + } + + if (tile_num_rows > 0) + { + + CTA_SYNC(); + + // Scan downsweep and scatter + ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; + + if (scan_item.key != scan_segment[0].key) + { + s_partials[scan_item.key] = scan_item.value; + } + else + { + scan_segment[0].value += scan_item.value; + } + + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if 
(scan_segment[ITEM - 1].key != scan_segment[ITEM].key) + { + s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; + } + else + { + scan_segment[ITEM].value += scan_segment[ITEM - 1].value; + } + } + + CTA_SYNC(); + + #pragma unroll 1 + for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) + { + spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + /** + * Consume input tile + */ + __device__ __forceinline__ void ConsumeTile( + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_merge_tiles) ///< [in] Number of merge tiles + { + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + + if (tile_idx >= num_merge_tiles) + return; + + // Read our starting coordinates + if (threadIdx.x < 2) + { + if (d_tile_coordinates == NULL) + { + // Search our starting coordinates + OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; + CoordinateT tile_coord; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coord); + + temp_storage.tile_coords[threadIdx.x] = tile_coord; + } + else + { + temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; + } + } + + CTA_SYNC(); + + CoordinateT tile_start_coord = temp_storage.tile_coords[0]; + CoordinateT tile_end_coord = temp_storage.tile_coords[1]; + + // Consume multi-segment tile + KeyValuePairT tile_carry = ConsumeTile( + tile_idx, + tile_start_coord, + tile_end_coord, + Int2Type()); + + // Output the tile's carry-out + if (threadIdx.x == 0) + { + if (HAS_ALPHA) + tile_carry.value *= spmv_params.alpha; + + tile_carry.key += tile_start_coord.x; + d_tile_carry_pairs[tile_idx] = tile_carry; + } + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/single_pass_scan_operators.cuh b/GraphBLAS/CUDA/local_cub/agent/single_pass_scan_operators.cuh new file mode 100644 index 0000000000..53409bdeec --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/single_pass_scan_operators.cuh @@ -0,0 +1,815 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a + * region independent of other thread blocks + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. + */ +template < + typename T, ///< BlockScan value type + typename ScanOpT> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOpT op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOpT op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Generic tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID = 99, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. 
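The specialization that follows packs the status word and the scanned value into a single TxnWord, so a tile's descriptor can be written and read back in one coherent access. Below is a minimal host-side sketch of the same packing idea for a 4-byte status plus a 4-byte value; Descriptor, Pack, and Unpack are invented names, and the real code instead selects StatusWord/TxnWord with If<> metaprogramming and goes through ThreadStore/ThreadLoad with cache modifiers.

#include <cstdint>
#include <cstdio>
#include <cstring>

enum Status : std::int32_t { TILE_INVALID = 99, TILE_PARTIAL, TILE_INCLUSIVE };

struct Descriptor               // 4-byte status + 4-byte value -> one 8-byte word
{
    std::int32_t status;
    float        value;
};
static_assert(sizeof(Descriptor) == sizeof(std::uint64_t), "descriptor must fit in one word");

static std::uint64_t Pack(Descriptor d)
{
    std::uint64_t word;
    std::memcpy(&word, &d, sizeof(word));   // type-pun through memcpy
    return word;
}

static Descriptor Unpack(std::uint64_t word)
{
    Descriptor d;
    std::memcpy(&d, &word, sizeof(d));
    return d;
}

int main()
{
    std::uint64_t slot = Pack({TILE_INVALID, 0.0f});   // tile not yet processed
    slot = Pack({TILE_INCLUSIVE, 42.0f});              // status and value published in one store
    Descriptor d = Unpack(slot);
    std::printf("status=%d value=%g\n", (int) d.status, d.value);
    return 0;
}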
+ */ +template +struct ScanTileState +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + + // Unit word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + uchar2>::Type>::Type>::Type TxnWord; + + + // Device word type + struct TileDescriptor + { + StatusWord status; + T value; + }; + + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + + // Device storage + TxnWord *d_tile_descriptors; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = 
reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template +struct ScanTileState +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast(allocations[0]); + d_tile_partial = reinterpret_cast(allocations[1]); + d_tile_inclusive = reinterpret_cast(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + // Update tile inclusive value + ThreadStore(d_tile_inclusive + 
TILE_STATUS_PADDING + tile_idx, tile_inclusive); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + // Update tile partial value + ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + do { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + + __threadfence(); // prevent hoisting loads from loop or loads below above this one + + } while (status == SCAN_TILE_INVALID); + + if (status == StatusWord(SCAN_TILE_PARTIAL)) + value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); + else + value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); + } +}; + + +/****************************************************************************** + * ReduceByKey tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename ValueT, + typename KeyT, + bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState : + ScanTileState > +{ + typedef ScanTileState > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. 
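Because the specialization above keeps the status and the value in separate arrays, SetPartial/SetInclusive write the value first, issue __threadfence(), and only then update the status word, while WaitForValid spins on the status before reading the value. A rough host-side analogue of that publish/consume pattern using C++ atomics is sketched here; Producer, Consumer, and the hard-coded status codes are invented for the sketch, and the GPU code relies on __threadfence() rather than std::atomic.

#include <atomic>
#include <cstdio>
#include <thread>

static double           tile_partial = 0.0;
static std::atomic<int> tile_status{99};          // 99 mirrors SCAN_TILE_INVALID

void Producer()
{
    tile_partial = 3.5;                               // write the payload first
    tile_status.store(1, std::memory_order_release);  // then publish the status
}

void Consumer()
{
    while (tile_status.load(std::memory_order_acquire) == 99)
        ;                                             // spin until the tile becomes valid
    std::printf("partial = %g\n", tile_partial);      // payload is now safe to read
}

int main()
{
    std::thread c(Consumer);
    std::thread p(Producer);
    p.join();
    c.join();
    return 0;
}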
+ */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState +{ + typedef KeyValuePairKeyValuePairT; + + // Constants + enum + { + PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), + TXN_WORD_SIZE = 1 << Log2::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) + struct TileDescriptorBigStatus + { + KeyT key; + ValueT value; + StatusWord status; + }; + + // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) + struct TileDescriptorLittleStatus + { + ValueT value; + StatusWord status; + KeyT key; + }; + + // Device word type + typedef typename If< + (sizeof(ValueT) == sizeof(KeyT)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + TileDescriptor; + + + // Device storage + TxnWord *d_tile_descriptors; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive.value; + tile_descriptor.key = tile_inclusive.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT 
tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.key = tile_partial.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + KeyValuePairT &value) + { +// TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// TileDescriptor tile_descriptor = reinterpret_cast(alias); +// +// while (tile_descriptor.status == SCAN_TILE_INVALID) +// { +// __threadfence_block(); // prevent hoisting loads from loop +// +// alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// tile_descriptor = reinterpret_cast(alias); +// } +// +// status = tile_descriptor.status; +// value.value = tile_descriptor.value; +// value.key = tile_descriptor.key; + + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.key = tile_descriptor.key; + } + +}; + + +/****************************************************************************** + * Prefix call-back operator for coupling local block scan within a + * block-cooperative scan + ******************************************************************************/ + +/** + * Stateful block-scan prefix functor. Provides the the running prefix for + * the current tile by using the call-back warp to wait on on + * aggregates/prefixes from predecessor tiles to become available. 
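TilePrefixCallbackOp computes a tile's exclusive prefix by decoupled look-back: the tile posts its local aggregate as PARTIAL, then walks backwards over predecessor tiles, accumulating their partials, until it reaches a tile whose INCLUSIVE prefix has already been published. The sequential sketch below assumes every predecessor is already valid and inspects one predecessor at a time, whereas the real functor inspects a warp-wide window and waits on still-invalid tiles; ExclusivePrefix and Tile are invented names.

#include <vector>

enum TileStatus { PARTIAL, INCLUSIVE };

struct Tile
{
    TileStatus status;
    double     value;   // partial aggregate, or inclusive prefix if status == INCLUSIVE
};

double ExclusivePrefix(const std::vector<Tile>& tiles, int tile_idx)
{
    double exclusive = 0.0;
    for (int pred = tile_idx - 1; pred >= 0; --pred)
    {
        exclusive += tiles[pred].value;        // fold in this predecessor
        if (tiles[pred].status == INCLUSIVE)   // it already covers every earlier tile,
            break;                             // so the look-back can stop here
    }
    return exclusive;
}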
+ */ +template < + typename T, + typename ScanOpT, + typename ScanTileStateT, + int PTX_ARCH = CUB_PTX_ARCH> +struct TilePrefixCallbackOp +{ + // Parameterized warp reduce + typedef WarpReduce WarpReduceT; + + // Temporary storage type + struct _TempStorage + { + typename WarpReduceT::TempStorage warp_reduce; + T exclusive_prefix; + T inclusive_prefix; + T block_aggregate; + }; + + // Alias wrapper allowing temporary storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + // Type of status word + typedef typename ScanTileStateT::StatusWord StatusWord; + + // Fields + _TempStorage& temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT& tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructor + __device__ __forceinline__ + TilePrefixCallbackOp( + ScanTileStateT &tile_status, + TempStorage &temp_storage, + ScanOpT scan_op, + int tile_idx) + : + temp_storage(temp_storage.Alias()), + tile_status(tile_status), + scan_op(scan_op), + tile_idx(tile_idx) {} + + + // Block until all predecessors within the warp-wide window have non-invalid status + __device__ __forceinline__ + void ProcessWindow( + int predecessor_idx, ///< Preceding tile index to inspect + StatusWord &predecessor_status, ///< [out] Preceding tile status + T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
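ProcessWindow folds one warp-wide window of predecessors into window_aggregate with a tail-segmented reduction: lane 0 holds the nearest predecessor's value, higher lanes hold progressively older ones, and the reduction is meant to stop at the first lane whose tail flag marks an INCLUSIVE prefix, since everything older is already folded into that prefix. The sequential sketch below captures that intended behavior under the stated assumption and is not the CUB implementation; WindowAggregate is an invented name.

#include <cstddef>
#include <vector>

double WindowAggregate(
    const std::vector<double>& value,      // value[lane]: lane 0 = nearest predecessor
    const std::vector<int>&    tail_flag)  // 1 if that predecessor's prefix is inclusive
{
    double aggregate = 0.0;
    for (std::size_t lane = 0; lane < value.size(); ++lane)
    {
        aggregate += value[lane];   // fold in this predecessor
        if (tail_flag[lane])        // inclusive prefix found: older lanes are already covered
            break;
    }
    return aggregate;
}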
+ + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + temp_storage.block_aggregate = block_aggregate; + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + + temp_storage.exclusive_prefix = exclusive_prefix; + temp_storage.inclusive_prefix = inclusive_prefix; + } + + // Return exclusive_prefix + return exclusive_prefix; + } + + // Get the exclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetExclusivePrefix() + { + return temp_storage.exclusive_prefix; + } + + // Get the inclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetInclusivePrefix() + { + return temp_storage.inclusive_prefix; + } + + // Get the block aggregate stored in temporary storage + __device__ __forceinline__ + T GetBlockAggregate() + { + return temp_storage.block_aggregate; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_adjacent_difference.cuh b/GraphBLAS/CUDA/local_cub/block/block_adjacent_difference.cuh new file mode 100644 index 0000000000..acef9f0568 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_adjacent_difference.cuh @@ -0,0 +1,596 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockAdjacentDifference +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(b, a, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(b, a); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * 
ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockAdjacentDifference() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockAdjacentDifference( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
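FlagHeads marks each item that starts a new run: a thread's first item is compared against the previous thread's last item (shared through temp_storage.last_items), or against tile_predecessor_item for thread 0, and the remaining items are compared pairwise within the thread. Because the arrangement is blocked, an item's flattened position is linear_tid * ITEMS_PER_THREAD + ITEM, which is also the index handed to flag operators that accept one. The host-side reference below assumes an inequality flag operator over a flattened blocked layout; FlagHeadsReference is an invented name.

#include <cstddef>
#include <vector>

std::vector<int> FlagHeadsReference(
    const std::vector<int>& items,                  // BLOCK_THREADS * ITEMS_PER_THREAD items, blocked order
    bool                    has_tile_predecessor,
    int                     tile_predecessor_item)
{
    std::vector<int> head_flags(items.size());
    for (std::size_t i = 0; i < items.size(); ++i)
    {
        if (i == 0)
            head_flags[i] = has_tile_predecessor
                          ? (items[i] != tile_predecessor_item)  // compare against previous tile
                          : 1;                                   // no predecessor: always a head
        else
            head_flags[i] = (items[i] != items[i - 1]);          // discontinuity with predecessor
    }
    return head_flags;
}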
+ { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/block/block_discontinuity.cuh b/GraphBLAS/CUDA/local_cub/block/block_discontinuity.cuh new file mode 100644 index 0000000000..503e3e0b04 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_discontinuity.cuh @@ -0,0 +1,1148 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be flagged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
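For reference, the example snippet above has lost its include target and template arguments in this diff; a complete, compilable version of the same head-flagging example might look like the following sketch (the kernel name is illustrative, the load/store helpers come from CUB's block_load/block_store headers, and d_in / d_head_flags are assumed to hold one 512-item tile):

#include <cub/cub.cuh>   // cub::BlockDiscontinuity, cub::LoadDirectBlocked, cub::Inequality

// Flag the first item of each run of equal values in a 512-item tile
// (128 threads, 4 consecutive items per thread, blocked arrangement).
__global__ void FlagRunHeads(const int *d_in, int *d_head_flags)
{
    // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
    typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

    // Allocate shared memory for BlockDiscontinuity
    __shared__ typename BlockDiscontinuity::TempStorage temp_storage;

    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

    // Collectively compute head flags for discontinuities in the segment
    int head_flags[4];
    BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());

    // Store the flags in the same blocked arrangement
    cub::StoreDirectBlocked(threadIdx.x, d_head_flags, head_flags);
}

Launched as FlagRunHeads<<<1, 128>>>(d_in, d_head_flags), the behavior follows the pattern described above: for thread_data of { [0,0,1,1], [1,1,1,1], [2,3,3,3], ... } the head flags come out as { [1,0,1,0], [0,0,0,0], [1,1,0,0], ... }.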
+ * + * \par Performance Considerations + * - Incurs zero bank conflicts for most types + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(a, b); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] 
Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
+ { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). 
+ * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads( + * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be + * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. 
+ * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. + * The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * tail_flags, thread_data, cub::Inequality(), tile_successor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + //@} end member group + /******************************************************************//** + * \name Head & tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). 
+ * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
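Note that the snippet above invokes FlagTails even though the member being documented sets both head and tail flags; a call matching the FlagHeadsAndTails declaration that follows would look like this sketch (the kernel name and constants are illustrative; 128 threads with 4 items per thread are assumed):

#include <cub/cub.cuh>

__global__ void FlagHeadsAndTailsKernel(const int *d_in, int *d_heads, int *d_tails)
{
    typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
    __shared__ typename BlockDiscontinuity::TempStorage temp_storage;

    // Blocked tile of 4 consecutive items per thread
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

    // Only the last thread's value is consulted for the tile successor
    int tile_successor_item = 0;
    if (threadIdx.x == 127) tile_successor_item = 125;   // illustrative value

    int head_flags[4];
    int tail_flags[4];
    BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
        head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());

    cub::StoreDirectBlocked(threadIdx.x, d_heads, head_flags);
    cub::StoreDirectBlocked(threadIdx.x, d_tails, tail_flags);
}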
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... 
+ * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. 
+ * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/block/block_exchange.cuh b/GraphBLAS/CUDA/local_cub/block/block_exchange.cuh new file mode 100644 index 0000000000..3ae9934391 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_exchange.cuh @@ -0,0 +1,1248 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. + * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the device-accessible memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). + * - BlockExchange supports the following types of data exchanges: + * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements + * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) + * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. 
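The class-level example above has likewise lost its template arguments in this diff; a complete version of the same striped-to-blocked conversion might read as follows (a sketch: the kernel name and the final blocked store are illustrative, and d_data / d_out are assumed to hold one 512-item tile):

#include <cub/cub.cuh>   // cub::BlockExchange, cub::LoadDirectStriped

__global__ void StripedToBlockedKernel(const int *d_data, int *d_out)
{
    // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;

    // Allocate shared memory for BlockExchange
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Load a tile of data striped across threads
    int thread_data[4];
    cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

    // Collectively exchange data into a blocked arrangement across threads
    BlockExchange(temp_storage).StripedToBlocked(thread_data);

    // Each thread now owns 4 consecutive items of the tile
    cub::StoreDirectBlocked(threadIdx.x, d_out, thread_data);
}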
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct __align__(16) _TempStorage + { + InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + unsigned int lane_id; + unsigned int warp_id; + unsigned int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. 
Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + if (warp_id == 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + #pragma unroll + for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + // Warp time-slicing + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + CTA_SYNC(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. 
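+ * (Each input_items[ITEM] is written to shared memory at offset ranks[ITEM], and the
+ * striped result is read back at offset ITEM * BLOCK_THREADS + linear_tid, so for a
+ * fully defined output the ranks are expected to cover 0 .. TILE_ITEMS-1.)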
+ */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
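+ *
+ * \par
+ * A minimal construction sketch (the specialization and names are illustrative; the
+ * temporary storage may also be placed in a union so its shared memory can be
+ * repurposed between collective calls, separated by a barrier):
+ * \code
+ * __global__ void ExampleKernel(...)
+ * {
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *     BlockExchange block_exchange(temp_storage);
+ *     ...
+ * }
+ * \endcode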
+ */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + lane_id(LaneId()), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(input_items, output_items, Int2Type()); + } + + + /** + * \brief Transposes data items from blocked arrangement to striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in + * preparation for storing to device-accessible memory. + * + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * after loading from device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * in preparation for storing to device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(input_items, output_items, Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into blocked arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. 
+ OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (ranks[ITEM] >= 0) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + */ + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (is_valid[ITEM]) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + //@} end member group + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + __device__ __forceinline__ void StripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(items, items); + } + + __device__ __forceinline__ void WarpStripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToWarpStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(items, items); + } + + template + __device__ __forceinline__ void ScatterToBlocked( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStriped( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStripedGuarded(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + ScatterToStriped(items, items, ranks, is_valid); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +template < + typename T, + int ITEMS_PER_THREAD, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + // Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct _TempStorage + { + T buff[WARP_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{WarpExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int lane_id; + +public: + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpExchange( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. 
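+ *
+ * \par
+ * A usage sketch (WarpExchange is an internal warp-scope helper; the names below,
+ * including \p WARPS_PER_BLOCK and \p warp_id, are illustrative, with one TempStorage
+ * instance per logical warp):
+ * \code
+ * typedef cub::WarpExchange<int, 4> WarpExchange;
+ * __shared__ typename WarpExchange::TempStorage temp_storage[WARPS_PER_BLOCK];
+ * int items[4], ranks[4];
+ * ...
+ * WarpExchange(temp_storage[warp_id]).ScatterToStriped(items, ranks);
+ * \endcode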
+ * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + temp_storage.buff[ranks[ITEM]] = items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage.buff[item_offset]; + } + } + +}; + + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_histogram.cuh b/GraphBLAS/CUDA/local_cub/block/block_histogram.cuh new file mode 100644 index 0000000000..b7cb9700e6 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_histogram.cuh @@ -0,0 +1,415 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) + * \ingroup BlockModule + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - BlockHistogram can be optionally specialized to use different algorithms: + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char data[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(data, smem_histogram); + * + * \endcode + * + * \par Performance and Usage Considerations + * - The histogram output can be constructed in shared or device-accessible memory + * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + int BINS, + BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockHistogram +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used + * regardless. + */ + static const BlockHistogramAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? + BLOCK_HISTO_SORT : + ALGORITHM; + + /// Internal specialization. + typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), + BlockHistogramSort, + BlockHistogramAtomic >::Type InternalBlockHistogram; + + /// Shared memory storage layout type for BlockHistogram + typedef typename InternalBlockHistogram::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockHistogram} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockHistogram() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
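+ *
+ * \par
+ * A minimal construction sketch (the 256-bin, 128-thread, 4-sample specialization and
+ * the variable names are illustrative; when the histogram is the only collective in
+ * the kernel, the default constructor's private allocation works equally well):
+ * \code
+ * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ * __shared__ typename BlockHistogram::TempStorage temp_storage;
+ * BlockHistogram block_histogram(temp_storage);
+ * \endcode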
+ */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Histogram operations + *********************************************************************/ + //@{ + + + /** + * \brief Initialize the shared histogram counters to zero. + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + { + // Initialize histogram bin counts to zeros + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + histogram[histo_offset + linear_tid] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + histogram[histo_offset + linear_tid] = 0; + } + } + + + /** + * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Histogram( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Initialize histogram bin counts to zeros + InitHistogram(histogram); + + CTA_SYNC(); + + // Composite the histogram + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + + + + /** + * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_load.cuh b/GraphBLAS/CUDA/local_cub/block/block_load.cuh new file mode 100644 index 0000000000..217f521234 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_load.cuh @@ -0,0 +1,1241 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for reading linear tiles of data into the CUDA thread block. + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM]; + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
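+ *
+ * \par
+ * A guarded-load sketch (the kernel signature and the names \p d_in and \p num_valid
+ * are illustrative; out-of-range slots of \p thread_data are left unmodified):
+ * \code
+ * __global__ void ExampleKernel(int *d_in, int num_valid)
+ * {
+ *     int thread_data[4];
+ *     cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, num_valid);
+ * }
+ * \endcode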
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + { + items[ITEM] = thread_itr[ITEM]; + } + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Internal implementation for load vectorization + */ +template < + CacheLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + // Biggest memory access word that T is a whole multiple of + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), + + VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? + 4 : + (TOTAL_WORDS % 2 == 0) ? + 2 : + 1, + + VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Vector items + Vector vec_items[VECTORS_PER_THREAD]; + + // Aliased input ptr + Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + { + vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + } +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. 
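+ * (This is the pointer-based, vectorized variant; see the alignment and fall-back
+ * conditions listed below.)
+ *
+ * \par
+ * A call sketch (\p d_in is an illustrative, suitably aligned pointer):
+ * \code
+ * int thread_data[4];
+ * cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
+ * \endcode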
+ * + * \blocked + * + * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void LoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); +} + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
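+ *
+ * \par Snippet (illustrative)
+ * A minimal sketch of a guarded striped load, assuming a 1D block of 128
+ * threads and 4 items per thread; the kernel name and parameters are
+ * hypothetical, not part of CUB. Note that \p BLOCK_THREADS must be
+ * supplied explicitly as a template argument.
+ * \code
+ * #include <cub/block/block_load.cuh>
+ *
+ * __global__ void GuardedStripedLoadKernel(const float *d_in, int num_valid)
+ * {
+ *     float thread_data[4];
+ *     // Thread t reads items t, t+128, t+256, t+384, skipping any beyond num_valid
+ *     cub::LoadDirectStriped<128>(threadIdx.x, d_in, thread_data, num_valid);
+ * }
+ * \endcode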
+ */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
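+ *
+ * \par Snippet (illustrative)
+ * A minimal sketch of a guarded warp-striped load with an out-of-bounds
+ * default, assuming a 1D block of 128 threads (a multiple of the warp size)
+ * and 4 items per thread; the kernel name and parameters are hypothetical,
+ * not part of CUB.
+ * \code
+ * #include <cub/block/block_load.cuh>
+ *
+ * __global__ void GuardedWarpStripedLoadKernel(const int *d_in, int num_valid)
+ * {
+ *     int thread_data[4];
+ *     // Out-of-range slots are assigned -1 instead of being left unassigned
+ *     cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data, num_valid, -1);
+ * }
+ * \endcode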
+ */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group + +/** @} */ // end group UtilIo + + + +//----------------------------------------------------------------------------- +// Generic BlockLoad abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ +enum BlockLoadAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * from memory using CUDA's built-in vectorized loads as a coalescing optimization. + * For example, ld.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector load width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p InputIteratorTis not a simple pointer type + * - The block input offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_LOAD_VECTORIZE, + + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. 
+ */ + BLOCK_LOAD_TRANSPOSE, + + + /** + * \par Overview + * + * A [warp-striped arrangement](index.html#sec5sec3) of data is + * read efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly larger latencies than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + * - Provisions more shared storage, but incurs smaller latencies than the + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE, + + + /** + * \par Overview + * + * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and then is locally transposed into a + * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory + * requirement, only one warp's worth of shared memory is provisioned and is + * subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, +}; + + +/** + * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockLoad class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockLoad can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory using CUDA's built-in vectorized loads as a + * coalescing optimization. 
[More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. A [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,. A [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockLoad} + * \par + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
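+ *
+ * \par
+ * For clarity, a self-contained sketch of the same scenario with the
+ * template parameters spelled out explicitly (\p int items, 128 threads,
+ * 4 items per thread, \p BLOCK_LOAD_WARP_TRANSPOSE); the kernel name and
+ * launch shown are illustrative only.
+ * \code
+ * #include <cub/block/block_load.cuh>
+ *
+ * __global__ void BlockLoadExampleKernel(const int *d_data)
+ * {
+ *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
+ *     __shared__ typename BlockLoadT::TempStorage temp_storage;
+ *
+ *     // Each thread ends up owning 4 consecutive items of the 512-item tile
+ *     int thread_data[4];
+ *     BlockLoadT(temp_storage).Load(d_data, thread_data);
+ * }
+ *
+ * // Host side: one block of 128 threads loads one 512-item tile
+ * // BlockLoadExampleKernel<<<1, 128>>>(d_data);
+ * \endcode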
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockLoad +{ +private: + + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Load helper + template + struct LoadInternal; + + + /** + * BLOCK_LOAD_DIRECT specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_VECTORIZE specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + 
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT> + __device__ __forceinline__ void Load( + CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + } + + /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Load( + _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range, 
with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be 
unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef LoadInternal InternalLoad; + + + /// Shared memory storage layout type + typedef typename InternalLoad::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + /// \smemstorage{BlockLoad} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). 
+ * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_radix_rank.cuh b/GraphBLAS/CUDA/local_cub/block/block_radix_rank.cuh new file mode 100644 index 0000000000..c26451c666 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_radix_rank.cuh @@ -0,0 +1,696 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block + */ + +#pragma once + +#include + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_scan.cuh" +#include "../block/block_scan.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. + * \ingroup BlockModule + * + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RADIX_BITS The number of radix bits per digit place + * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * Blah... + * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par Examples + * \par + * - Example 1: Simple radix rank of 32-bit integer keys + * \code + * #include + * + * template + * __global__ void ExampleKernel(...) + * { + * + * \endcode + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRank +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + // Integer type for digit counters (to be packed into words of type PackedCounters) + typedef unsigned short DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), + unsigned long long, + unsigned int>::Type PackedCounter; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // The number of packed counters per thread (plus one for padding) + PADDED_COUNTER_LANES = COUNTER_LANES + 1, + RAKING_SEGMENT = PADDED_COUNTER_LANES, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + }; + +private: + + + /// BlockScan type + typedef BlockScan< + PackedCounter, + BLOCK_DIM_X, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScan; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + union Aliasable + { + DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; + + } aliasable; + + // Storage for scanning local ranks + typename BlockScan::TempStorage block_scan; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /// Copy of raking segment, promoted to registers + PackedCounter cached_segment[RAKING_SEGMENT]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal storage allocator + */ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Performs upsweep raking reduction, returning the aggregate + */ + __device__ __forceinline__ PackedCounter Upsweep() + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + PackedCounter *raking_ptr; + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + 
raking_ptr = smem_raking_ptr; + } + + return internal::ThreadReduce(raking_ptr, Sum()); + } + + + /// Performs exclusive downsweep raking scan + __device__ __forceinline__ void ExclusiveDownsweep( + PackedCounter raking_partial) + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? + cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) + { + *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Block-scan prefix callback + */ + struct PrefixCallBack + { + __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) + { + PackedCounter block_prefix = 0; + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + return block_prefix; + } + }; + + + /** + * Scan shared memory digit counters. + */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute exclusive sum + PackedCounter exclusive_partial; + PrefixCallBack prefix_call_back; + BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); + + // Downsweep scan with exclusive partial + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
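+ *
+ * \par Snippet (illustrative)
+ * A minimal sketch of ranking one 4-bit digit place for a tile of unsigned
+ * keys, assuming 128 threads and 4 keys per thread; the kernel name and the
+ * \p d_keys parameter are hypothetical, not part of CUB.
+ * \code
+ * #include <cub/block/block_radix_rank.cuh>
+ *
+ * __global__ void RankDigitsKernel(const unsigned int *d_keys)
+ * {
+ *     typedef cub::BlockRadixRank<128, 4, false> BlockRadixRankT;
+ *     __shared__ typename BlockRadixRankT::TempStorage temp_storage;
+ *
+ *     // Blocked arrangement: thread t owns keys 4t .. 4t+3
+ *     unsigned int keys[4];
+ *     for (int i = 0; i < 4; ++i)
+ *         keys[i] = d_keys[(threadIdx.x * 4) + i];
+ *
+ *     // Rank each key by its least-significant 4-bit digit
+ *     int ranks[4];
+ *     BlockRadixRankT(temp_storage).RankKeys(keys, ranks, 0, 4);
+ * }
+ * \endcode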
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Get digit + unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + + // Get sub-counter + unsigned int sub_counter = digit >> LOG_COUNTER_LANES; + + // Get counter lane + unsigned int counter_lane = digit & (COUNTER_LANES - 1); + + if (IS_DESCENDING) + { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + // Pointer to smem digit counter + digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; + + // Load thread-exclusive prefix + thread_prefixes[ITEM] = *digit_counters[ITEM]; + + // Store inclusive prefix + *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; + } + + CTA_SYNC(); + + // Scan shared memory counters + ScanCounters(); + + CTA_SYNC(); + + // Extract the local ranks of each key + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Add in thread block exclusive prefix + ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; + } + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + // Rank keys + RankKeys(keys, ranks, current_bit, num_bits); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) 
+ unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); + unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); + + exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; + } + } + } +}; + + + + + +/** + * Radix-rank using match.any + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRankMatch +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + typedef int32_t RankT; + typedef int32_t DigitCounterT; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + PADDED_WARPS = ((WARPS & 0x1) == 0) ? + WARPS + 1 : + WARPS, + + COUNTERS = PADDED_WARPS * RADIX_DIGITS, + RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, + PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? + RAKING_SEGMENT + 1 : + RAKING_SEGMENT, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + }; + +private: + + /// BlockScan type + typedef BlockScan< + DigitCounterT, + BLOCK_THREADS, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScanT; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + typename BlockScanT::TempStorage block_scan; + + union __align__(16) Aliasable + { + volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; + DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; + + } aliasable; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRankMatch( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + // Initialize shared digit counters + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; + + CTA_SYNC(); + + // Each warp will strip-mine its section of input, one strip at a time + + volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; + uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; + uint32_t lane_mask_lt = LaneMaskLt(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // My digit + uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + + if (IS_DESCENDING) + digit = RADIX_DIGITS - digit - 1; + + // Mask of peers who have same digit as me + uint32_t peer_mask = MatchAny(digit); + + // Pointer to smem digit counter for this key + digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; + + // Number of occurrences in previous strips + DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of peers having same digit as me + int32_t digit_count = __popc(peer_mask); + + // Number of lower-ranked peers having same digit seen so far + int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); + + if (peer_digit_prefix == 0) + { + // First thread for each digit updates the shared warp counter + *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); + } + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of prior keys having same digit + ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); + } + + CTA_SYNC(); + + // Scan warp counters + + DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; + + BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; + + CTA_SYNC(); + + // Seed ranks with counter values from previous warps + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + ranks[ITEM] += *digit_counters[ITEM]; + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, current_bit, num_bits); + + // Get exclusive count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/block/block_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/block/block_radix_sort.cuh new file mode 100644 index 0000000000..ac0c9f85b1 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_radix_sort.cuh @@ -0,0 +1,863 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
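The RankKeys routine of BlockRadixRankMatch above hinges on a warp-wide match-any vote: lanes holding the same digit find one another, __popc of the peer mask restricted to lower lanes gives each key a stable rank among equal digits, and only the first peer updates the shared per-digit counter. The following is a minimal standalone sketch of that warp-level trick, not CUB code: it uses __match_any_sync (CUDA 9+, sm_70+), whereas CUB's MatchAny also emulates the vote on older architectures, and the kernel name and test data are assumptions.

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void warp_rank_digit(const unsigned int *keys, int *ranks,
                                    int current_bit, int num_bits)
    {
        unsigned int lane  = threadIdx.x & 31u;
        unsigned int digit = (keys[threadIdx.x] >> current_bit) & ((1u << num_bits) - 1u);

        // Mask of lanes in this warp holding the same digit
        unsigned int peers = __match_any_sync(0xFFFFFFFFu, digit);

        // Peers in lower lanes determine my rank among keys with this digit
        unsigned int lane_mask_lt = (1u << lane) - 1u;
        ranks[threadIdx.x] = __popc(peers & lane_mask_lt);
    }

    int main()
    {
        unsigned int h_keys[32];
        for (int i = 0; i < 32; ++i) h_keys[i] = i % 4;      // digits 0..3, repeated
        unsigned int *d_keys; int *d_ranks; int h_ranks[32];
        cudaMalloc(&d_keys, sizeof(h_keys));
        cudaMalloc(&d_ranks, sizeof(h_ranks));
        cudaMemcpy(d_keys, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
        warp_rank_digit<<<1, 32>>>(d_keys, d_ranks, /*current_bit=*/0, /*num_bits=*/2);
        cudaMemcpy(h_ranks, d_ranks, sizeof(h_ranks), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 32; ++i) printf("key %u -> within-digit rank %d\n", h_keys[i], h_ranks[i]);
        cudaFree(d_keys); cudaFree(d_ranks);
        return 0;
    }

In the full RankKeys above, this within-warp rank is then offset by the occurrences counted in earlier strips of the same warp and, after the block-wide exclusive scan of the per-warp digit counters, by earlier warps and smaller digits.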
![](sorting_logo.png) + * \ingroup BlockModule + * + * \tparam KeyT KeyT type + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) + * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * - \rowmajor + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
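The snippet in the overview above omits the include path and the BlockRadixSort template arguments. A compilable rendering of the same example, assuming one 128-thread block sorting 512 contiguous int keys (the kernel name and the manual blocked load/store loops are additions, not CUB code), could look like this:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortKernel(int *d_keys)     // d_keys holds 512 ints
    {
        // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
        typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;

        // Allocate shared memory for BlockRadixSort
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        // Obtain a segment of consecutive items that are blocked across threads
        int thread_keys[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            thread_keys[i] = d_keys[threadIdx.x * 4 + i];

        // Collectively sort the keys
        BlockRadixSort(temp_storage).Sort(thread_keys);

        // Write the sorted keys back in the same blocked arrangement
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            d_keys[threadIdx.x * 4 + i] = thread_keys[i];
    }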
+ * + */ +template < + typename KeyT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + typename ValueT = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + // Whether or not there are values to be trucked along with keys + KEYS_ONLY = Equals::VALUE, + }; + + // KeyT traits and unsigned bits type + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// Ascending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + false, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + AscendingBlockRadixRank; + + /// Descending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + true, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + DescendingBlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange BlockExchangeValues; + + /// Shared memory storage layout type + union _TempStorage + { + typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; + typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /// Rank keys (specialized for ascending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// Rank keys (specialized for descending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT 
(&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); + } + + /// ExchangeValues (specialized for key-value sort, to-striped arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + } + + /// ExchangeValues (specialized for keys-only sort) + template + __device__ __forceinline__ void ExchangeValues( + ValueT (&/*values*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + {} + + /// Sort blocked arrangement + template + __device__ __forceinline__ void SortBlocked( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit if done + if (begin_bit >= end_bit) break; + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Sort blocked -> striped arrangement + template + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked 
keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); + + // Last pass exchanges through shared memory in striped arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// \smemstorage{BlockRadixSort} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
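The Sort() member declared just below also accepts optional begin_bit/end_bit arguments. As a hedged sketch (kernel name and data layout are assumptions), restricting the sort to the low 16 bits skips half of the digit passes when the upper bits of every key are known to be equal:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortLow16(unsigned int *d_keys)   // 512 keys, upper 16 bits identical
    {
        typedef cub::BlockRadixSort<unsigned int, 128, 4> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        unsigned int thread_keys[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            thread_keys[i] = d_keys[threadIdx.x * 4 + i];

        // Only digit places covering bits [0,16) are visited
        BlockRadixSort(temp_storage).Sort(thread_keys, /*begin_bit=*/0, /*end_bit=*/16);

        #pragma unroll
        for (int i = 0; i < 4; ++i)
            d_keys[threadIdx.x * 4 + i] = thread_keys[i];
    }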
+ */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + /** + * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
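As with the earlier snippets, the key-value example above omits its include and template arguments. A compilable rendering for a descending sort of 512 (int key, int value) pairs follows; the kernel name and the blocked load/store loops are assumptions:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortPairsDescending(int *d_keys, int *d_values)
    {
        // KeyT = int, 128 threads, 4 items per thread, ValueT = int
        typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        int thread_keys[4], thread_values[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
        {
            thread_keys[i]   = d_keys[threadIdx.x * 4 + i];
            thread_values[i] = d_values[threadIdx.x * 4 + i];
        }

        // Collectively sort the pairs by key, largest key first
        BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);

        #pragma unroll
        for (int i = 0; i < 4; ++i)
        {
            d_keys[threadIdx.x * 4 + i]   = thread_keys[i];
            d_values[threadIdx.x * 4 + i] = thread_values[i];
        }
    }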
+ * + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. 
+ * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
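The payoff of the blocked-to-striped variants in this member group is that a striped result can be written back with fully coalesced stores: after the call, item i of thread t corresponds to output slot i*BLOCK_THREADS + t. A hedged sketch for the ascending case (kernel name and buffer layout assumed):

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortToStriped(const int *d_in, int *d_out)   // 512 keys
    {
        typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        int thread_keys[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            thread_keys[i] = d_in[threadIdx.x * 4 + i];     // blocked arrangement in

        // Sort, leaving the result in a striped arrangement across the block
        BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);

        // Striped arrangement out: consecutive threads write consecutive addresses
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            d_out[i * 128 + threadIdx.x] = thread_keys[i];
    }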
+ * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. + * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + +}; + +/** + * \example example_block_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_raking_layout.cuh b/GraphBLAS/CUDA/local_cub/block/block_raking_layout.cuh new file mode 100644 index 0000000000..3500616863 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_raking_layout.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. 
+ * \tparam PTX_ARCH [optional] \ptxversion + */ +template < + typename T, + int BLOCK_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = BLOCK_THREADS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) + HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + + /// Degree of bank conflicts (e.g., 4-way) + CONFLICT_DEGREE = (HAS_CONFLICTS) ? + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + 1, + + /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load + USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), + + /// Total number of elements in the raking grid + GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) + UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + struct __align__(16) _TempStorage + { + T buff[BlockRakingLayout::GRID_ELEMENTS]; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + // Offset for partial + unsigned int offset = linear_tid; + + // Add in one padding element for every segment + if (USE_SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage.Alias().buff + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_reduce.cuh b/GraphBLAS/CUDA/local_cub/block/block_reduce.cuh new file mode 100644 index 0000000000..261f2ea6f5 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_reduce.cuh @@ -0,0 +1,607 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_raking_commutative_only.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA thread block. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm that only supports commutative + * reduction operators (true for most operations, e.g., addition). + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Threads in warps other than the first warp place + * their partial reductions into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within the first + * warp continue to accumulate by raking across segments of shared partial reductions + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
+ * \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE + * and is preferable when the reduction operator is commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, + + + /** + * \par Overview + * An efficient "raking" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. \blocked. + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
+ * \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs more communication than BLOCK_REDUCE_RAKING + * and is only preferable when the reduction operator is non-commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING, + + + /** + * \par Overview + * A quick "tiled warp-reductions" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. + * + * \par + * Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. + * + * \par + * \image html block_scan_warpscans.png + *
+ * \p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
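The three enumerators above are chosen through BlockReduce's ALGORITHM template parameter. A hedged sketch of a block-sum kernel that pins the variant explicitly (the kernel name and the one-block-per-128-items layout are assumptions); swapping in cub::BLOCK_REDUCE_RAKING or cub::BLOCK_REDUCE_WARP_REDUCTIONS changes only the latency/throughput trade-off, not the result:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>

    __global__ void ExampleBlockSum(const int *d_in, int *d_block_sums)
    {
        // Sum is commutative, so the cheapest raking variant is admissible
        typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
        __shared__ typename BlockReduceT::TempStorage temp_storage;

        int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

        // Block-wide sum; the return value is only defined for thread0
        int block_sum = BlockReduceT(temp_storage).Sum(thread_data);
        if (threadIdx.x == 0)
            d_block_sums[blockIdx.x] = block_sum;
    }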
+ * + * \par Performance Considerations + * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING + * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall + * throughput across the GPU. However turn-around latency may be lower and + * thus useful when the GPU is under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being reduced + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - \rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Very efficient (only one synchronization barrier). + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full vs. partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * if (threadIdx.x < num_valid) thread_data = ... 
+ * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction functor + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + else + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + } + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); + } + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ */ + template + __device__ __forceinline__ T Sum( + T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, cub::Sum()); + return Sum(partial); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item (up to num_items) + * int thread_data; + * if (threadIdx.x < num_valid) + * thread_data = ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + else + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + } + + + //@} end member group +}; + +/** + * \example example_block_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_scan.cuh b/GraphBLAS/CUDA/local_cub/block/block_scan.cuh new file mode 100644 index 0000000000..27ea7ed409 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_scan.cuh @@ -0,0 +1,2126 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. + * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_raking.png + *
<div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+ * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_SCAN_RAKING, + + + /** + * \par Overview + * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at + * the expense of higher register pressure. Raking threads preserve their + * "upsweep" segment of values in registers while performing warp-synchronous + * scan, allowing the "downsweep" not to re-read them from shared memory. + */ + BLOCK_SCAN_RAKING_MEMOIZE, + + + /** + * \par Overview + * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. + * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_warpscans.png + *
<div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
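+ * \par
+ * A brief selection sketch (assuming the \p ALGORITHM template parameter of
+ * cub::BlockScan documented below). Note that BlockScan silently falls back
+ * to BLOCK_SCAN_RAKING when the thread block size is not a multiple of the
+ * architectural warp size:
+ * \code
+ * // Explicitly request the tiled-warpscans variant for 128 threads on int
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+ * \endcode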
+ * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warpscans, it can + * often provide lower turnaround latencies when the GPU is under-occupied. + */ + BLOCK_SCAN_WARP_SCANS, +}; + + +/****************************************************************************** + * Block scan + ******************************************************************************/ + +/** + * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being scanned + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - \rowmajor + * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: + * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Invokes a minimal number of minimal block-wide synchronization barriers (only + * one or two depending on algorithm selection) + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Prefix sum variants (vs. generic scan) + * - \blocksize + * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockScan} + * \par + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. + * The corresponding output \p thread_data in those threads will be + * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy + * cannot be used with thread block sizes not a multiple of the + * architectural warp size. + */ + static const BlockScanAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? + BLOCK_SCAN_RAKING : + ALGORITHM; + + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; + + /// Define the delegate type for the desired algorithm + typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), + WarpScans, + Raking>::Type InternalBlockScan; + + /// Shared memory storage layout type for BlockScan + typedef typename InternalBlockScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Public types + ******************************************************************************/ +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
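+     * \par
+     * A minimal sketch contrasting the two construction styles (assuming the
+     * cub::BlockScan<int, 128> specialization used throughout these snippets,
+     * inside a kernel body):
+     * \code
+     * typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     * // (1) Default constructor: the collective allocates its own private
+     * //     shared-memory storage
+     * int a = threadIdx.x;
+     * BlockScan().ExclusiveSum(a, a);
+     *
+     * // (2) Explicit constructor: the caller supplies TempStorage, which can
+     * //     be reused (or placed in a union) with other shared allocations
+     * __shared__ typename BlockScan::TempStorage temp_storage;
+     * int b = threadIdx.x;
+     * BlockScan(temp_storage).ExclusiveSum(b, b);
+     * \endcode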
+ */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. 
+ * The output for the second segment will be 128, 129, ..., 255. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + + //@} end member group // Exclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + //@} end member group +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, single datum per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); + } + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + //@} end member group +#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. 
The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + InclusiveScan(input, output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InclusiveScan(input, output, cub::Sum(), block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad BlockLoad;
+ * typedef cub::BlockStore BlockStore;
+ * typedef cub::BlockScan BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, prefix_op);
+ * CTA_SYNC();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+ * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0], block_prefix_callback_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+ // Exclusive thread block-scan
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
+
+ // Inclusive scan in registers with prefix as seed
+ internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+ }
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include // or equivalently
+ *
+ * __global__ void ExampleKernel(...)
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
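The seed/aggregate contract described above can be summarized sequentially: for each block-wide scan (one "tile"), the scan hands the callback the tile-wide aggregate and uses the returned value as the prefix for every output in that tile. Below is a minimal host-side model of that contract, using max as the scan operator to mirror the running-max snippet that follows; the function name is hypothetical and the sketch is illustrative only, not part of the sources added by this diff.

#include <algorithm>
#include <functional>
#include <vector>

// Sequential model of the BlockPrefixCallbackOp contract (illustrative only):
// the callback receives each tile's aggregate and returns the seed that
// logically prefixes that tile's outputs.
static void inclusive_max_scan_by_tiles(
    std::vector<int> &data,
    size_t tile_items,
    const std::function<int(int)> &block_prefix_callback_op)
{
    for (size_t base = 0; base < data.size(); base += tile_items)
    {
        size_t end = std::min(data.size(), base + tile_items);

        // Tile-wide aggregate of the raw inputs (what the callback is handed)
        int block_aggregate = data[base];
        for (size_t i = base + 1; i < end; ++i)
            block_aggregate = std::max(block_aggregate, data[i]);

        // The seed returned by the callback prefixes every output in the tile
        int running = block_prefix_callback_op(block_aggregate);

        for (size_t i = base; i < end; ++i)
        {
            running = std::max(running, data[i]);
            data[i] = running;
        }
    }
}

Pairing this model with the stateful running-max functor shown in the snippet below, seeded with INT_MIN, reproduces the per-segment outputs quoted in that snippet's discussion.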
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. + * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan (with no initial value) + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. 
+ * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. + * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_shuffle.cuh b/GraphBLAS/CUDA/local_cub/block/block_shuffle.cuh new file mode 100644 index 0000000000..a0cc71d222 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_shuffle.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShuffle abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShuffle +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T prev[BLOCK_THREADS]; + T next[BLOCK_THREADS]; + }; + + +public: + + /// \smemstorage{BlockShuffle} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockShuffle() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShuffle( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Shuffle movement + *********************************************************************/ + //@{ + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. 
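As a sketch of the intended use of this shuffle interface, Offset with a distance of -1 lets each thread read its predecessor's value, for example to form adjacent differences. The kernel and buffer names are hypothetical, the include path is assumed to mirror the upstream CUB layout, and the sketch is illustrative only, not part of the sources added by this diff.

#include <cub/block/block_shuffle.cuh>   // assumed path; local_cub mirrors the upstream CUB layout

__global__ void AdjacentDifferenceKernel(const int *d_in, int *d_out)
{
    // Specialize BlockShuffle for a 1D block of 128 threads on type int
    typedef cub::BlockShuffle<int, 128> BlockShuffle;

    // Allocate shared memory for BlockShuffle
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    int item = d_in[blockIdx.x * 128 + threadIdx.x];

    // Fetch the predecessor's item; thread0's output is left untouched,
    // so seed it with an identity of our choosing.
    int prev = 0;
    BlockShuffle(temp_storage).Offset(item, prev, -1);

    d_out[blockIdx.x * 128 + threadIdx.x] = item - prev;
}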
+ * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Offset( + T input, ///< [in] The input item from the calling thread (threadi) + T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 + int distance = 1) ///< [in] Offset distance (may be negative) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS)) + output = temp_storage[linear_tid + distance].prev; + } + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Rotate( + T input, ///< [in] The calling thread's input item + T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 + unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + unsigned int offset = threadIdx.x + distance; + if (offset >= BLOCK_THREADS) + offset -= BLOCK_THREADS; + + output = temp_storage[offset].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads + { + Up(input, prev); + block_suffix = temp_storage[BLOCK_THREADS - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
+ { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + { + Up(input, prev); + block_prefix = temp_storage[BLOCK_THREADS - 1].prev; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_store.cuh b/GraphBLAS/CUDA/local_cub/block/block_store.cuh new file mode 100644 index 0000000000..648bf9ff4d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_store.cuh @@ -0,0 +1,1000 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) 
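To make the range-guarded overload above concrete, the sketch below writes full tiles with the unguarded StoreDirectBlocked and switches to the guarded overload for the final, partial tile. The kernel and buffer names are hypothetical, the include path is assumed, and a 1D launch with 128-thread blocks is assumed; this is an illustrative sketch, not part of the sources added by this diff.

#include <cub/block/block_store.cuh>   // assumed path; local_cub mirrors the upstream CUB layout

__global__ void StoreTailKernel(int *d_out, int num_items)
{
    const int BLOCK_THREADS    = 128;   // assumed launch configuration
    const int ITEMS_PER_THREAD = 4;
    const int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

    // A blocked arrangement: each thread holds 4 consecutive items
    int items[ITEMS_PER_THREAD];
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        items[i] = threadIdx.x * ITEMS_PER_THREAD + i;

    int block_offset = blockIdx.x * TILE_ITEMS;
    int valid_items  = num_items - block_offset;

    if (valid_items >= TILE_ITEMS)
    {
        // Full tile: unguarded blocked store
        cub::StoreDirectBlocked(threadIdx.x, d_out + block_offset, items);
    }
    else
    {
        // Partial tile: guard the store with the number of valid items
        cub::StoreDirectBlocked(threadIdx.x, d_out + block_offset, items, valid_items);
    }
}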
+ * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? + MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); +} + + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
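The coalescing argument behind these trade-offs comes down to which global addresses neighbouring threads touch. A small host-side sketch of the index arithmetic (illustrative only, not part of the sources added by this diff) makes the contrast explicit: under a blocked store, adjacent threads write addresses ITEMS_PER_THREAD apart, while under a striped store they write adjacent words, which is why the transposing variants convert to a striped arrangement before touching memory.

#include <cstdio>

// Global index written by thread `tid` for its ITEM-th element under the two
// arrangements discussed above (illustrative only).
static int blocked_index(int tid, int item, int items_per_thread)
{
    return tid * items_per_thread + item;   // neighbouring threads are items_per_thread apart
}

static int striped_index(int tid, int item, int block_threads)
{
    return item * block_threads + tid;      // neighbouring threads write adjacent addresses
}

int main()
{
    const int BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4;
    // First element written by threads 0..3:
    for (int tid = 0; tid < 4; ++tid)
        std::printf("tid %d: blocked -> %d, striped -> %d\n",
                    tid,
                    blocked_index(tid, 0, ITEMS_PER_THREAD),   // 0, 4, 8, 12
                    striped_index(tid, 0, BLOCK_THREADS));     // 0, 1, 2, 3
    return 0;
}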
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid 
items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS 
= CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T 
(&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items); + } + + /** + * \brief Store items into a linear segment of memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. + * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * only the first two threads being unmasked to store portions of valid data. + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_atomic.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 0000000000..29db0df710 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage) + {} + + + /// Composite data onto an existing histogram + template < + typename T, + typename CounterT, + int ITEMS_PER_THREAD> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_sort.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_sort.cuh new file mode 100644 index 0000000000..9ef417adca --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? 
true : false, + BLOCK_SCAN_WARP_SCANS, + cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + CTA_SYNC(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + CTA_SYNC(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + CTA_SYNC(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if 
necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 0000000000..aff97fc9b5 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * + * Supports non-commutative binary reduction operators. Unlike commutative + * reduction operators (e.g., addition), the application of a non-commutative + * reduction operator (e.g, string concatenation) across a sequence of inputs must + * honor the relative ordering of items and partial reductions when applying the + * reduction operator. 
+ * + * Compared to the implementation of BlockReduceRaking (which does not support + * non-commutative operators), this implementation requires a few extra + * rounds of inter-thread communication. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + union _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + // Update partial if addend is in range + if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + { + T addend = raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T * /*raking_segment*/, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + return partial; + } + + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
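+    ///
+    /// A sketch of how this specialization is typically reached from user code via the
+    /// public cub::BlockReduce wrapper (the 128-thread sizing, thread_item, and d_out
+    /// are illustrative assumptions, not part of this file):
+    /// \code
+    /// typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduceT;
+    /// __shared__ typename BlockReduceT::TempStorage temp_storage;
+    /// int aggregate = BlockReduceT(temp_storage).Reduce(thread_item, cub::Max());
+    /// if (threadIdx.x == 0) d_out[blockIdx.x] = aggregate;   // result valid in thread0 only
+    /// \endcode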
+ template < + bool IS_FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + int valid_raking_threads = (IS_FULL_TILE) ? + RAKING_THREADS : + (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; + + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + valid_raking_threads, + reduction_op); + + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + return Reduce(partial, num_valid, reduction_op); + } + + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking_commutative_only.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 0000000000..454fdafa50 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRakingCommutativeOnly +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values + typedef BlockReduceRaking FallBack; + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Whether or not to use fall-back + USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), + + /// Number of raking threads + RAKING_THREADS = WARP_THREADS, + + /// Number of threads actually sharing items with the raking threads + SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, + }; + + /// WarpReduce utility type + typedef WarpReduce WarpReduce; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Shared memory storage layout type + union _TempStorage + { + struct + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRakingCommutativeOnly( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Sum(partial); + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_warp_reductions.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 0000000000..10ba303b4c --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,218 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
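+ *
+ * \par
+ * A minimal usage sketch through the public cub::BlockReduce wrapper with the
+ * corresponding policy (the 256-thread sizing and thread_value are illustrative
+ * assumptions):
+ * \code
+ * typedef cub::BlockReduce<float, 256, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
+ * __shared__ typename BlockReduceT::TempStorage temp_storage;
+ * float block_sum = BlockReduceT(temp_storage).Sum(thread_value);  // valid in thread0 only
+ * \endcode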
+ */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the thread block size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire thread block + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in thread0. 
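+    ///
+    /// Each warp's lane0 first publishes its warp-wide partial into
+    /// temp_storage.warp_aggregates; after a CTA_SYNC(), thread0 folds the aggregates
+    /// of the remaining warps into its own. For example, with 128 threads and 32-thread
+    /// warps there are 4 warp aggregates, and thread0 applies the reduction operator
+    /// to warp_aggregates[1..3].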
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + CTA_SYNC(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + int warp_offset = (warp_id * LOGICAL_WARP_SIZE); + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? + LOGICAL_WARP_SIZE : + num_valid - warp_offset; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( + input, + warp_num_valid, + cub::Sum()); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int warp_offset = warp_id * LOGICAL_WARP_SIZE; + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? + LOGICAL_WARP_SIZE : + num_valid - warp_offset; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_raking.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 0000000000..a855cda0ba --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,666 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
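+ *
+ * \par
+ * A minimal sketch of reaching this specialization through the public cub::BlockScan
+ * wrapper (the 128-thread sizing and thread_in are illustrative assumptions):
+ * \code
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScanT;
+ * __shared__ typename BlockScanT::TempStorage temp_storage;
+ * int thread_out;
+ * BlockScanT(temp_storage).ExclusiveSum(thread_in, thread_out);
+ * \endcode
+ * The cub::BLOCK_SCAN_RAKING_MEMOIZE policy maps to this same specialization with
+ * MEMOIZE = true, trading extra registers for fewer shared memory reads.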
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /// Templated reduction + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); + } + + + /// Templated reduction (base case) + template + __device__ __forceinline__ T GuardedReduce( + T* /*raking_ptr*/, ///< [in] Input array + ScanOp /*scan_op*/, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + return raking_partial; + } + + + /// Templated copy + template + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type /*iteration*/) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* /*out*/, ///< [out] Out array + T* /*in*/, ///< [in] Input array + Int2Type /*iteration*/) + {} 
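+
+    // The scan methods below use this raking pattern: (1) every thread deposits its
+    // partial into the shared raking grid; (2) the first RAKING_THREADS threads each
+    // serially reduce a SEGMENT_LENGTH-long segment (Upsweep) and cooperate in a
+    // warp-synchronous scan of the segment totals; (3) each raking thread scans its
+    // segment back through shared memory (ExclusiveDownsweep / InclusiveDownsweep),
+    // after which every thread reads its own result from the grid.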
+
+
+    /// Performs upsweep raking reduction, returning the aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ T Upsweep(
+        ScanOp scan_op)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data into registers
+        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+
+        T raking_partial = cached_segment[0];
+
+        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
+    }
+
+
+    /// Performs exclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanExclusive<SEGMENT_LENGTH>(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    /// Performs inclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanInclusive<SEGMENT_LENGTH>(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for thread0 is undefined.
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + exclusive_output = *placement_ptr; + } + } + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial= Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + + // Broadcast aggregate to other threads + if (linear_tid == 0) + temp_storage.block_aggregate = block_aggregate; + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + output = scan_op(block_prefix, output); + if (linear_tid == 0) + output = block_prefix; + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with exclusive warpscan partial + output = scan_op(block_prefix, output); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 0000000000..85e4d6135a --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,392 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScan; + + /// Shared memory storage layout type + + struct __align__(32) _TempStorage + { + T warp_aggregates[WARPS]; + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction + ScanOp /*scan_op*/, ///< [in] Binary scan operator + T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans2.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans2.cuh new file mode 100644 index 0000000000..4de7c69b70 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans2.cuh @@ -0,0 +1,436 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScanT; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ 
__forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
+ T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
+ { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans3.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans3.cuh new file mode 100644 index 0000000000..147ca4c5af --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans3.cuh @@ -0,0 +1,418 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, + + /// Number of outer scan warps + OUTER_WARPS = INNER_WARP_THREADS + }; + + /// Outer WarpScan utility type + typedef WarpScan OuterWarpScanT; + + /// Inner WarpScan utility type + typedef WarpScan InnerWarpScanT; + + typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; + + + /// Shared memory storage layout type + struct _TempStorage + { + union Aliasable + { + Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans + typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan + + } aliasable; + + T warp_aggregates[OUTER_WARPS]; + + T block_aggregate; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), + lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
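A scaled-down worked example of the two-level scheme used above (sum operator; real warps have 32 lanes, but take BLOCK_THREADS = 8 and OUTER_WARP_THREADS = 4, i.e. two outer warps, for illustration): inputs are warp 0 = [3, 1, 4, 1] and warp 1 = [5, 9, 2, 6]. The outer warp scans produce the per-warp exclusive partials [-, 3, 4, 8] and [-, 5, 14, 16], and the last lane of each warp writes its warp total (9 and 22) into warp_aggregates. The inner warp scan over [9, 22] yields the warp prefixes [-, 9] and block_aggregate = 31. Warp 1 then applies its prefix 9 to each partial (lane 0 simply takes 9), giving the block-wide exclusive result [-, 3, 4, 8, 9, 14, 23, 25]; the thread-0 slot stays undefined because no initial value was supplied.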
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + { + temp_storage.warp_aggregates[warp_id] = inclusive_output; + } + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial (or assign it if partial is invalid) + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
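The prefix-callback overload above is what makes multi-tile (streaming) scans work: one thread block scans a long sequence tile by tile, and the callback supplies the running total of the previous tiles as the seed for the current tile. A hedged sketch of such a functor, written against the public cub::BlockScan interface (the names here are illustrative, not part of this file):

// Running-total functor: invoked by the first warp once per tile with that
// tile's aggregate; returns the prefix to apply, then accumulates the total.
struct BlockPrefixCallbackOp
{
    int running_total;

    __device__ BlockPrefixCallbackOp(int initial) : running_total(initial) {}

    __device__ int operator()(int tile_aggregate)
    {
        int old_prefix = running_total;    // prefix for the current tile
        running_total += tile_aggregate;   // carry into the next tile
        return old_prefix;
    }
};

// Inside a kernel, per 128-item tile:
//     BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data, prefix_op);
//     __syncthreads();   // so temp_storage can be reused for the next tile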
+ OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/cub.cuh b/GraphBLAS/CUDA/local_cub/cub.cuh new file mode 100644 index 0000000000..3ece0f6584 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/cub.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" +//#include "block/block_shift.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_partition.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_run_length_encode.cuh" +#include "device/device_scan.cuh" +#include "device/device_segmented_radix_sort.cuh" +#include "device/device_segmented_reduce.cuh" +#include "device/device_select.cuh" +#include "device/device_spmv.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Iterator +#include "iterator/arg_index_input_iterator.cuh" +#include "iterator/cache_modified_input_iterator.cuh" +#include "iterator/cache_modified_output_iterator.cuh" +#include "iterator/constant_input_iterator.cuh" +#include "iterator/counting_input_iterator.cuh" +#include "iterator/tex_obj_input_iterator.cuh" +#include "iterator/tex_ref_input_iterator.cuh" +#include "iterator/transform_input_iterator.cuh" + +// Util +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" + diff --git 
a/GraphBLAS/CUDA/local_cub/device/device_histogram.cuh b/GraphBLAS/CUDA/local_cub/device/device_histogram.cuh new file mode 100644 index 0000000000..a2556a6b85 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_histogram.cuh @@ -0,0 +1,866 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_histogram.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + */ +struct DeviceHistogram +{ + /******************************************************************//** + * \name Evenly-segmented bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
+ * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
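A self-contained host-side sketch of the same two-phase pattern (size query, then run); the include path is an assumption, error checking is omitted for brevity, and the file must be compiled with nvcc:

#include <cstdio>
#include <cuda_runtime.h>
#include "local_cub/cub.cuh"       // assumed path to the vendored CUB headers

int main()
{
    // Six equal-width bins over [0, 12); 999.5 falls outside and is not counted.
    const int   num_samples = 10;
    const float h_samples[num_samples] =
        {2.2f, 6.0f, 7.1f, 2.9f, 3.5f, 0.3f, 2.9f, 2.0f, 6.1f, 999.5f};
    const int   num_levels  = 7;
    const float lower_level = 0.0f, upper_level = 12.0f;

    float *d_samples   = NULL;
    int   *d_histogram = NULL;
    cudaMalloc(&d_samples, sizeof(h_samples));
    cudaMalloc(&d_histogram, (num_levels - 1) * sizeof(int));
    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

    // Phase 1: with d_temp_storage == NULL, only the required size is computed.
    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);

    // Phase 2: allocate the scratch space and compute the histogram.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);

    int h_histogram[num_levels - 1];
    cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_levels - 1; ++i)
        printf("bin %d: %d\n", i, h_histogram[i]);   // bins of width 2 over [0, 12)

    cudaFree(d_samples); cudaFree(d_histogram); cudaFree(d_temp_storage);
    return 0;
}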
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * size_t row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramEven( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
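The row_stride_bytes parameter of this overload is simply the byte pitch between consecutive rows, so a pitched allocation can be passed straight through. A brief sketch under that assumption (function and variable names are illustrative, and the histogram arrays are assumed to be pre-allocated device buffers):

#include <cuda_runtime.h>
#include "local_cub/cub.cuh"       // assumed path to the vendored CUB headers

// Three histograms over the RGB channels of pitched RGBA8 image rows.
void HistogramOfPitchedImage(int num_row_pixels, int num_rows,
                             int *d_histogram[3], int num_levels[3],
                             unsigned int lower_level[3], unsigned int upper_level[3])
{
    // cudaMallocPitch pads each row; the returned pitch is exactly the
    // row_stride_bytes this interface expects (and trivially a multiple
    // of sizeof(unsigned char)).
    unsigned char *d_pixels = NULL;
    size_t pitch_bytes = 0;
    cudaMallocPitch(&d_pixels, &pitch_bytes,
                    num_row_pixels * 4 * sizeof(unsigned char), num_rows);
    // ... fill d_pixels with RGBA samples ...

    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_pixels, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, pitch_bytes);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_pixels, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, pitch_bytes);

    cudaFree(d_pixels);
    cudaFree(d_temp_storage);
}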
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + //@} end member group + /******************************************************************//** + * \name Custom bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of an six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 
+ CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * int row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ , , , , , , , ] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. 
\iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. 
+ * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + * // (0, 6, 7, 5),(3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // d_histogram <-- [ [1, 3, 0, 1], + * // [3, 0, 0, 2], + * // [0, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramRange( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [2, 3, 0, 1], + * // [3, 0, 0, 2], + * // [1, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
+        {
+            // Down-convert OffsetT data type
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+
+    //@}  end member group
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/GraphBLAS/CUDA/local_cub/device/device_partition.cuh b/GraphBLAS/CUDA/local_cub/device/device_partition.cuh
new file mode 100644
index 0000000000..5053540071
--- /dev/null
+++ b/GraphBLAS/CUDA/local_cub/device/device_partition.cuh
@@ -0,0 +1,273 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
![](partition_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected for the first partition with 50% probability. + * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_partition_flagged.cu + * \example example_device_partition_if.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/device/device_radix_sort.cuh new file mode 100644 index 0000000000..1c0bdbea1d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_radix_sort.cuh @@ -0,0 +1,797 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) + * \ingroup SingleModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. 
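The overview above points out that, although radix sorting natively orders unsigned integer keys, signed and floating-point keys are handled through order-preserving bit-wise transformations. The sketch below exercises that path with a SortKeys call on float keys; the data and include path are illustrative assumptions, and error checking and cudaFree are omitted.

    // Sketch: sorting float keys with DeviceRadixSort::SortKeys.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>   // adjust the include path for the vendored local_cub copy

    int main()
    {
        const int num_items = 7;
        float h_keys[num_items] = { 8.5f, -6.0f, 7.25f, 5.0f, -3.5f, 0.0f, 9.0f };

        float *d_keys_in, *d_keys_out;
        cudaMalloc(&d_keys_in,  num_items * sizeof(float));
        cudaMalloc(&d_keys_out, num_items * sizeof(float));
        cudaMemcpy(d_keys_in, h_keys, num_items * sizeof(float), cudaMemcpyHostToDevice);

        // First call sizes the temporary storage, second call sorts
        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_keys_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_keys_out, num_items);
        cudaDeviceSynchronize();

        // d_keys_out now holds [-6.0, -3.5, 0.0, 5.0, 7.25, 8.5, 9.0]
        return 0;
    }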
+ * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + + /******************************************************************//** + * \name KeyT-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. 
Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts keys into ascending order. 
(~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). 
+ * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts keys into descending order. (~2N auxiliary storage required). 
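The DoubleBuffer-based SortKeys overload implemented above trades the ~2N variant's separate output array for a ping-pong pair of buffers, with d_keys.Current() reporting which buffer ends up holding the sorted keys. A minimal sketch of that calling pattern follows; the data are hypothetical, the include path is an assumption, and error handling and cudaFree are omitted.

    // Sketch: the low-storage (~N) interface with cub::DoubleBuffer.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>   // adjust the include path for the vendored local_cub copy

    int main()
    {
        const int num_items = 7;
        int h_keys[num_items] = { 8, 6, 7, 5, 3, 0, 9 };

        int *d_key_buf, *d_key_alt_buf;
        cudaMalloc(&d_key_buf,     num_items * sizeof(int));
        cudaMalloc(&d_key_alt_buf, num_items * sizeof(int));
        cudaMemcpy(d_key_buf, h_keys, num_items * sizeof(int), cudaMemcpyHostToDevice);

        // The DoubleBuffer tracks which of the two buffers is "current"
        cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
        cudaDeviceSynchronize();

        // d_keys.Current() now points at whichever buffer received the
        // sorted output: [0, 3, 5, 6, 7, 8, 9]
        int *d_sorted = d_keys.Current();
        (void) d_sorted;
        return 0;
    }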
+ * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. 
+ * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_reduce.cuh b/GraphBLAS/CUDA/local_cub/device/device_reduce.cuh new file mode 100644 index 0000000000..13c7a72d1a --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_reduce.cuh @@ -0,0 +1,734 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
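As a concrete instance of such a fold, the sketch below runs a device-wide integer sum with DeviceReduce::Sum, following the same size-query-then-run pattern used throughout these headers. The data and include path are illustrative assumptions; error checking and cudaFree are omitted.

    // Sketch: device-wide sum of 7 ints with DeviceReduce::Sum.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>   // adjust the include path for the vendored local_cub copy

    int main()
    {
        const int num_items = 7;
        int h_in[num_items] = { 8, 6, 7, 5, 3, 0, 9 };

        int *d_in, *d_out;
        cudaMalloc(&d_in,  num_items * sizeof(int));
        cudaMalloc(&d_out, sizeof(int));
        cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

        // First call sizes the temporary storage, second call reduces
        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

        int h_out = 0;
        cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
        // h_out == 38
        return 0;
    }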
+ * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * \linear_performance{reduction, reduce-by-key, and run-length encode} + * + * \par + * The following chart illustrates DeviceReduce::Sum + * performance across different CUDA architectures for \p int32 keys. + * + * \image html reduce_int32.png + * + * \par + * The following chart illustrates DeviceReduce::ReduceByKey (summation) + * performance across different CUDA architectures for \p fp32 + * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceReduce +{ + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * __device__ __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * CustomMin min_op; + * int init; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + T init, ///< [in] Initial value of the reduction + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide sum using the addition (\p +) operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction. + * - Does not support \p + operators that are non-commutative.. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sum-reduction performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. + * + * \image html reduce_int32.png + * \image html reduce_int64.png + * + * \par Snippet + * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction. + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // d_out <-- [{5, 0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... 
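For orientation, a minimal self-contained sketch of the two-pass calling convention used by DeviceReduce::Min and DeviceReduce::ArgMin above (the <cub/cub.cuh> umbrella include, the host-side setup, and names such as h_in and d_temp_storage2 are illustrative assumptions, not part of this header):

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_min;
    cub::KeyValuePair<int, int> *d_argmin;
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_min, sizeof(int));
    cudaMalloc(&d_argmin, sizeof(*d_argmin));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Pass 1 (d_temp_storage == NULL): only the required temp-storage size is written.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // Pass 2: the reduction itself.  d_min <-- 0
    cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);

    // ArgMin follows the same pattern; its result is a {key = offset, value = minimum} pair.
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceReduce::ArgMin(d_temp_storage2, temp_storage_bytes2, d_in, d_argmin, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceReduce::ArgMin(d_temp_storage2, temp_storage_bytes2, d_in, d_argmin, num_items);  // {5, 0}

    int h_min;
    cub::KeyValuePair<int, int> h_argmin;
    cudaMemcpy(&h_min, d_min, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_argmin, d_argmin, sizeof(h_argmin), cudaMemcpyDeviceToHost);
    printf("min = %d at offset %d\n", h_argmin.value, h_argmin.key);

    cudaFree(d_in); cudaFree(d_min); cudaFree(d_argmin);
    cudaFree(d_temp_storage); cudaFree(d_temp_storage2);
    return 0;
}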
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // d_out <-- [9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // d_out <-- [{6, 9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + * + * \par + * This operation computes segmented reductions within \p d_values_in using + * the specified binary \p reduction_op functor. The segments are identified by + * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of + * consecutive, identical keys. For the ith run encountered, + * the first key of the run and the corresponding value aggregate of that run are + * written to d_unique_out[i] and d_aggregates_out[i], + * respectively. The total number of runs encountered is written to \p d_num_runs_out. 
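Max and ArgMax mirror the Min/ArgMin interface; a hedged, self-contained sketch under the same assumptions (umbrella header, illustrative host setup) follows:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_max;
    cub::KeyValuePair<int, int> *d_argmax;
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_max, sizeof(int));
    cudaMalloc(&d_argmax, sizeof(*d_argmax));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Size query with d_temp_storage == NULL, then the actual reduction.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);          // d_max <-- 9

    // ArgMax writes a {key = offset, value = maximum} pair.
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceReduce::ArgMax(d_temp_storage2, temp_storage_bytes2, d_in, d_argmax, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceReduce::ArgMax(d_temp_storage2, temp_storage_bytes2, d_in, d_argmax, num_items);  // {6, 9}

    cub::KeyValuePair<int, int> h_argmax;
    cudaMemcpy(&h_argmax, d_argmax, sizeof(h_argmax), cudaMemcpyDeviceToHost);
    printf("max = %d at offset %d\n", h_argmax.value, h_argmax.key);
    return 0;
}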
+ * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following chart illustrates reduction-by-key (sum) performance across + * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments + * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * \image html reduce_by_key_fp64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html reduce_by_key_fp32_len_5.png + * \image html reduce_by_key_fp64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the segmented reduction of \p int values grouped + * by runs of associated \p int keys. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] + * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_num_runs_out; // e.g., [-] + * CustomMin reduction_op; + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduce-by-key + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_aggregates_out <-- [0, 1, 6, 2, 4] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator + * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename ReductionOpT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t ReduceByKey( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // FlagT iterator type (not used) + + // Selection op (not used) + + // Default == operator + typedef Equality EqualityOp; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp(), + reduction_op, + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_run_length_encode.cuh b/GraphBLAS/CUDA/local_cub/device/device_run_length_encode.cuh new file mode 100644 index 0000000000..7a2e82d9d7 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_run_length_encode.cuh @@ -0,0 +1,278 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_rle.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. 
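A compact, self-contained sketch of the ReduceByKey call documented above, using the same example data as the snippet; the include line, functor name, and host scaffolding are illustrative assumptions:

#include <cub/cub.cuh>
#include <cstdio>

// Binary reduction functor with the same shape as the CustomMin in the snippet above
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

int main()
{
    const int num_items = 8;
    int h_keys[num_items]   = {0, 2, 2, 9, 5, 5, 5, 8};
    int h_values[num_items] = {0, 7, 1, 6, 2, 5, 3, 4};

    int *d_keys_in, *d_values_in, *d_unique_out, *d_aggregates_out, *d_num_runs_out;
    cudaMalloc(&d_keys_in,        num_items * sizeof(int));
    cudaMalloc(&d_values_in,      num_items * sizeof(int));
    cudaMalloc(&d_unique_out,     num_items * sizeof(int));
    cudaMalloc(&d_aggregates_out, num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out,   sizeof(int));
    cudaMemcpy(d_keys_in,   h_keys,   num_items * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values, num_items * sizeof(int), cudaMemcpyHostToDevice);

    CustomMin reduction_op;
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    // Size-query pass, allocation, then the reduce-by-key pass itself.
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
        reduction_op, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
        reduction_op, num_items);

    // Expected: d_unique_out = [0, 2, 9, 5, 8], d_aggregates_out = [0, 1, 6, 2, 4], d_num_runs_out = [5]
    int h_num_runs;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d runs\n", h_num_runs);
    return 0;
}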
![](run_length_encode_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A run-length encoding + * computes a simple compressed representation of a sequence of input elements such that each + * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a + * count of the elements in that run. + * + * \par Usage Considerations + * \cdp_class{DeviceRunLengthEncode} + * + * \par Performance + * \linear_performance{run-length encode} + * + * \par + * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across + * different CUDA architectures for \p int32 items. + * Segments have lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceRunLengthEncode +{ + + /** + * \brief Computes a run-length encoding of the sequence \p d_in. + * + * \par + * - For the ith run encountered, the first key of the run and its length are written to + * d_unique_out[i] and d_counts_out[i], + * respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated encode performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * \image html rle_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html rle_int32_len_5.png + * \image html rle_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the run-length encoding of a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_unique_out; // e.g., [ , , , , , , , ] + * int *d_counts_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_counts_out <-- [1, 2, 1, 3, 1] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename UniqueOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Encode( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + typedef cub::Sum ReductionOp; // Value reduction operator + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + // Generator type for providing 1s values for run-length reduction + typedef ConstantInputIterator LengthsInputIteratorT; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + LengthsInputIteratorT((LengthT) 1), + d_counts_out, + d_num_runs_out, + EqualityOp(), + ReductionOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. + * + * \par + * - For the ith non-trivial run, the run's starting offset + * and its length are written to d_offsets_out[i] and + * d_lengths_out[i], respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * + * \par Snippet + * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_offsets_out; // e.g., [ , , , , , , , ] + * int *d_lengths_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // d_offsets_out <-- [1, 4] + * // d_lengths_out <-- [2, 3] + * // d_num_runs_out <-- [2] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename OffsetsOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t NonTrivialRuns( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
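The Encode entry point above reduces runs of identical values to (unique value, run length) pairs; a minimal sketch under the same assumptions as the earlier examples:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

    int *d_in, *d_unique_out, *d_counts_out, *d_num_runs_out;
    cudaMalloc(&d_in,           num_items * sizeof(int));
    cudaMalloc(&d_unique_out,   num_items * sizeof(int));
    cudaMalloc(&d_counts_out,   num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Size query, allocation, then the encoding pass.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);

    // Expected: d_unique_out = [0, 2, 9, 5, 8], d_counts_out = [1, 2, 1, 3, 1], d_num_runs_out = [5]
    int h_num_runs;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d runs\n", h_num_runs);
    return 0;
}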
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef Equality EqualityOp; // Default == operator + + return DeviceRleDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_scan.cuh b/GraphBLAS/CUDA/local_cub/device/device_scan.cuh new file mode 100644 index 0000000000..e86fefe3cd --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_scan.cuh @@ -0,0 +1,443 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
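NonTrivialRuns, in contrast to Encode, reports only runs of length greater than one, as starting offsets and lengths; a hedged usage sketch with the same illustrative host setup:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

    int *d_in, *d_offsets_out, *d_lengths_out, *d_num_runs_out;
    cudaMalloc(&d_in,           num_items * sizeof(int));
    cudaMalloc(&d_offsets_out,  num_items * sizeof(int));
    cudaMalloc(&d_lengths_out,  num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);

    // Expected: d_offsets_out = [1, 4], d_lengths_out = [2, 3], d_num_runs_out = [2]
    int h_num_runs;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d non-trivial runs\n", h_num_runs);
    return 0;
}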
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_scan.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) + * \ingroup SingleModule + * + * \par Overview + * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output sequence where each element is computed to be the reduction + * of the elements occurring earlier in the input sequence. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * + * \par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm + * for performing global prefix scan with only a single pass through the + * input data, as described in our 2016 technical report [1]. The central + * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies + * of global prefix propagation with local computation. As such, our algorithm requires only + * ~2n data movement (n inputs are read, n outputs are written), and typically + * proceeds at "memcpy" speeds. + * + * \par + * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) + * + * \par Usage Considerations + * \cdp_class{DeviceScan} + * + * \par Performance + * \linear_performance{prefix scan} + * + * \par + * The following chart illustrates DeviceScan::ExclusiveSum + * performance across different CUDA architectures for \p int32 keys. + * \plots_below + * + * \image html scan_int32.png + * + */ +struct DeviceScan +{ + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative sum operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated exclusive sum performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. 
+ * + * \image html scan_int32.png + * \image html scan_int64.png + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveSum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Initial value + OutputT init_value = 0; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. 
+ * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op + * ... + * + * // Determine temporary device storage requirements for exclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. + * + * \par + * - Supports non-commutative sum operators. 
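A self-contained sketch covering both ExclusiveSum and ExclusiveScan as documented above; INT_MAX (from <climits>) is used here as the seed value in place of the snippet's "(int) MAX_INT", and the host scaffolding is an illustrative assumption:

#include <cub/cub.cuh>
#include <climits>
#include <cstdio>

// Same shape as the CustomMin functor in the snippet above
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Exclusive prefix sum: size query, allocation, then the scan itself.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    // d_out <-- [0, 8, 14, 21, 26, 29, 29]

    // Exclusive min-scan with a user-supplied operator and INT_MAX as the initial value.
    CustomMin min_op;
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceScan::ExclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, INT_MAX, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceScan::ExclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, INT_MAX, num_items);
    // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]

    int h_out[num_items];
    cudaMemcpy(h_out, d_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; i++) printf("%d ", h_out[i]);
    printf("\n");
    return 0;
}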
+ * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveSum( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. 
+ * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_segmented_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/device/device_segmented_radix_sort.cuh new file mode 100644 index 0000000000..0d36076277 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_segmented_radix_sort.cuh @@ -0,0 +1,876 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
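The inclusive variants take no initial value, since the ith output already incorporates the ith input; a hedged sketch of InclusiveSum and InclusiveScan with the same illustrative setup:

#include <cub/cub.cuh>
#include <cstdio>

// Same shape as the CustomMin functor used in the snippet above
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Inclusive prefix sum: d_out <-- [8, 14, 21, 26, 29, 29, 38]
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    // Inclusive min-scan (no initial value needed): d_out <-- [8, 6, 6, 5, 3, 0, 0]
    CustomMin min_op;
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceScan::InclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceScan::InclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, num_items);

    int h_out[num_items];
    cudaMemcpy(h_out, d_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; i++) printf("%d ", h_out[i]);
    printf("\n");
    return 0;
}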
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedRadixSort} + * + */ +struct DeviceSegmentedRadixSort +{ + + /******************************************************************//** + * \name Key-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
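A minimal sketch of the pointer-based SortPairs overload above, using the same three-segment example; the shared offsets array is reused as both begin and (shifted by one) end offsets, and the host setup is an illustrative assumption:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 7, num_segments = 3;
    int h_offsets[num_segments + 1] = {0, 3, 3, 7};      // the middle segment is empty
    int h_keys[num_items]           = {8, 6, 7, 5, 3, 0, 9};
    int h_values[num_items]         = {0, 1, 2, 3, 4, 5, 6};

    int *d_offsets, *d_keys_in, *d_keys_out, *d_values_in, *d_values_out;
    cudaMalloc(&d_offsets,    (num_segments + 1) * sizeof(int));
    cudaMalloc(&d_keys_in,    num_items * sizeof(int));
    cudaMalloc(&d_keys_out,   num_items * sizeof(int));
    cudaMalloc(&d_values_in,  num_items * sizeof(int));
    cudaMalloc(&d_values_out, num_items * sizeof(int));
    cudaMemcpy(d_offsets,   h_offsets, (num_segments + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_keys_in,   h_keys,    num_items * sizeof(int),          cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values,  num_items * sizeof(int),          cudaMemcpyHostToDevice);

    // Size query, allocation, then the segmented sort itself.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);

    // Expected: d_keys_out = [6, 7, 8, 0, 3, 5, 9], d_values_out = [1, 2, 0, 5, 4, 3, 6]
    int h_keys_out[num_items];
    cudaMemcpy(h_keys_out, d_keys_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; i++) printf("%d ", h_keys_out[i]);
    printf("\n");
    return 0;
}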
+ * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_segmented_reduce.cuh b/GraphBLAS/CUDA/local_cub/device/device_segmented_reduce.cuh new file mode 100644 index 0000000000..6c3b54a031 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_segmented_reduce.cuh @@ -0,0 +1,619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
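In the segmented variant, the combining operator is applied independently within each segment, so the call produces one aggregate per segment rather than a single value. A minimal host-side sketch (illustrative only) of a per-segment sum over the example data used in the snippets below, assuming device-resident arrays: with offsets [0, 3, 3, 7] and input [8, 6, 7, 5, 3, 0, 9], the result is [21, 0, 17], the empty middle segment reducing to the initial value 0.

    // Sketch: per-segment sum using the usual two-phase temp-storage pattern.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    void segmented_sum(const int *d_in, int *d_out,
                       int num_segments, const int *d_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // First call sizes the temporary storage; second call runs the reduction.
        cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);

        cudaFree(d_temp_storage);
    }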
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedReduce} + * + */ +struct DeviceSegmentedReduce +{ + /** + * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * CustomMin min_op; + * int initial_value; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT, + typename ReductionOp, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOp reduction_op, ///< [in] Binary reduction functor + T initial_value, ///< [in] Initial value of the reduction for each segment + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented sum using the addition ('+') operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p + operators that are non-commutative.. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [21, 0, 17] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). 
+ * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
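For the Arg* reductions, the per-segment aggregate is a cub::KeyValuePair whose key member is the item's offset within its segment and whose value member is the extremum itself. A minimal host-side sketch for int input (illustrative only, assuming device-resident arrays):

    // Sketch: per-segment argmin over int data. For int inputs the output element
    // type is cub::KeyValuePair<int, int>: .key is the item's offset within its
    // segment, .value is the segment minimum.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    void segmented_argmin(const int *d_in, cub::KeyValuePair<int, int> *d_out,
                          int num_segments, const int *d_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);

        // Copy the first segment's result back to the host and unpack it.
        cub::KeyValuePair<int, int> h_result;
        cudaMemcpy(&h_result, d_out, sizeof(h_result), cudaMemcpyDeviceToHost);
        int argmin_offset = h_result.key;    // position within segment 0
        int min_value     = h_result.value;  // minimum of segment 0
        (void) argmin_offset; (void) min_value;

        cudaFree(d_temp_storage);
    }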
+ * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [8, INT_MIN, 9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_select.cuh b/GraphBLAS/CUDA/local_cub/device/device_select.cuh new file mode 100644 index 0000000000..52a3e126da --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_select.cuh @@ -0,0 +1,369 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. 
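For reference, a minimal self-contained sketch of calling cub::DeviceSegmentedReduce::ArgMax as documented above, using the same example data as the header comment. It assumes the headers added in this diff are reachable as <cub/cub.cuh>; the include path, main() harness, and printf check are illustrative assumptions, and error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    int main()
    {
        const int num_items    = 7;
        const int num_segments = 3;
        int h_in[num_items]             = {8, 6, 7, 5, 3, 0, 9};
        int h_offsets[num_segments + 1] = {0, 3, 3, 7};

        int *d_in, *d_offsets;
        cub::KeyValuePair<int, int> *d_out;      // key = offset within segment, value = segment max
        cudaMalloc(&d_in, sizeof(h_in));
        cudaMalloc(&d_offsets, sizeof(h_offsets));
        cudaMalloc(&d_out, num_segments * sizeof(*d_out));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
        cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);

        // First call (NULL temp storage) sizes the allocation; second call does the work.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);

        cub::KeyValuePair<int, int> h_out[num_segments];
        cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
        for (int i = 0; i < num_segments; i++)
            printf("segment %d: max %d at offset %d\n", i, h_out[i].value, h_out[i].key);

        cudaFree(d_in); cudaFree(d_offsets); cudaFree(d_out); cudaFree(d_temp_storage);
        return 0;
    }

The same size-then-run pattern (first call with a NULL d_temp_storage, then the real call) applies to every Device* entry point in these headers.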
![](select_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. + * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. + * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. 
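A self-contained sketch of the DeviceSelect::If entry point documented above, using the LessThan functor from that snippet. The <cub/cub.cuh> include path and the __host__ __device__ qualifiers on the functor are illustrative assumptions; error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    // Selection functor: keep values strictly less than a threshold
    struct LessThan
    {
        int compare;
        __host__ __device__ __forceinline__ LessThan(int compare) : compare(compare) {}
        __host__ __device__ __forceinline__ bool operator()(const int &a) const
        {
            return (a < compare);
        }
    };

    int main()
    {
        const int num_items = 8;
        int h_in[num_items] = {0, 2, 3, 9, 5, 2, 81, 8};

        int *d_in, *d_out, *d_num_selected_out;
        cudaMalloc(&d_in, sizeof(h_in));
        cudaMalloc(&d_out, sizeof(h_in));
        cudaMalloc(&d_num_selected_out, sizeof(int));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

        LessThan select_op(7);

        // Size the temporary storage, allocate it, then run the selection.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op);

        int h_num_selected = 0;
        cudaMemcpy(&h_num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
        int h_out[num_items];
        cudaMemcpy(h_out, d_out, h_num_selected * sizeof(int), cudaMemcpyDeviceToHost);
        printf("selected %d items:", h_num_selected);          // expected: 5 items (0 2 3 5 2)
        for (int i = 0; i < h_num_selected; i++) printf(" %d", h_out[i]);
        printf("\n");

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_num_selected_out); cudaFree(d_temp_storage);
        return 0;
    }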
+ * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_select_flagged.cu + * \example example_device_select_if.cu + * \example example_device_select_unique.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_spmv.cuh b/GraphBLAS/CUDA/local_cub/device/device_spmv.cuh new file mode 100644 index 0000000000..63b6a7e86f --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_spmv.cuh @@ -0,0 +1,174 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_spmv_orig.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
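Before the SpMV header, a compact sketch of the DeviceSelect::Unique entry point defined above. It follows the same two-phase calling pattern; the <cub/cub.cuh> include path is an illustrative assumption and error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    int main()
    {
        const int num_items = 8;
        int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

        int *d_in, *d_out, *d_num_selected_out;
        cudaMalloc(&d_in, sizeof(h_in));
        cudaMalloc(&d_out, sizeof(h_in));
        cudaMalloc(&d_num_selected_out, sizeof(int));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

        // Two-phase call: NULL temp storage sizes the allocation, the second call runs it.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items);

        int h_num = 0;
        cudaMemcpy(&h_num, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
        printf("%d run heads kept\n", h_num);   // expected: 5 (the runs 0, 2, 9, 5, 8)

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_num_selected_out); cudaFree(d_temp_storage);
        return 0;
    }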
+ * \ingroup SingleModule + * + * \par Overview + * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) + * performs the matrix-vector operation + * y = alpha*A*x + beta*y, + * where: + * - A is an mxn sparse matrix whose non-zero structure is specified in + * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) + * (i.e., three arrays: values, row_offsets, and column_indices) + * - x and y are dense vectors + * - alpha and beta are scalar multiplicands + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief This function performs the matrix-vector operation y = A*x. + * + * \par Snippet + * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A + * representing a 3x3 lattice (24 non-zeros). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // and output vector y + * int num_rows = 9; + * int num_cols = 9; + * int num_nonzeros = 24; + * + * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1] + * + * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + * // 4, 6, 1, 3, 5, 7, 2, 4, + * // 8, 3, 7, 4, 6, 8, 5, 7] + * + * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + * + * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + * float* d_vector_y; // e.g., [ , , , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run SpMV + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + * + * \endcode + * + * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + */ + template < + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) + int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
+ ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + SpmvParams spmv_params; + spmv_params.d_values = d_values; + spmv_params.d_row_end_offsets = d_row_offsets + 1; + spmv_params.d_column_indices = d_column_indices; + spmv_params.d_vector_x = d_vector_x; + spmv_params.d_vector_y = d_vector_y; + spmv_params.num_rows = num_rows; + spmv_params.num_cols = num_cols; + spmv_params.num_nonzeros = num_nonzeros; + spmv_params.alpha = 1.0; + spmv_params.beta = 0.0; + + return DispatchSpmv::Dispatch( + d_temp_storage, + temp_storage_bytes, + spmv_params, + stream, + debug_synchronous); + } + + //@} end member group +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_histogram.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_histogram.cuh new file mode 100644 index 0000000000..ab08e8ed05 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_histogram.cuh @@ -0,0 +1,1096 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
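A self-contained sketch of the DeviceSpmv::CsrMV entry point defined above, using the 3x3-lattice CSR matrix from its header comment. Note that this entry point takes no alpha/beta arguments: the wrapper fixes alpha = 1 and beta = 0 before dispatch, so the call computes y = A*x. The <cub/cub.cuh> include path and the printed check are illustrative assumptions; error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    int main()
    {
        // 9x9 CSR matrix of a 3x3 lattice, 24 nonzeros, all values 1;
        // x is the all-ones vector, so each y[i] counts the neighbors of node i.
        const int num_rows = 9, num_cols = 9, num_nonzeros = 24;
        float h_values[num_nonzeros];
        for (int i = 0; i < num_nonzeros; i++) h_values[i] = 1.0f;
        int h_column_indices[num_nonzeros] = {1, 3, 0, 2, 4, 1, 5, 0,
                                              4, 6, 1, 3, 5, 7, 2, 4,
                                              8, 3, 7, 4, 6, 8, 5, 7};
        int h_row_offsets[num_rows + 1] = {0, 2, 5, 7, 10, 14, 17, 19, 22, 24};
        float h_x[num_cols];
        for (int i = 0; i < num_cols; i++) h_x[i] = 1.0f;

        float *d_values, *d_x, *d_y;
        int *d_column_indices, *d_row_offsets;
        cudaMalloc(&d_values, sizeof(h_values));
        cudaMalloc(&d_column_indices, sizeof(h_column_indices));
        cudaMalloc(&d_row_offsets, sizeof(h_row_offsets));
        cudaMalloc(&d_x, sizeof(h_x));
        cudaMalloc(&d_y, num_rows * sizeof(float));
        cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
        cudaMemcpy(d_column_indices, h_column_indices, sizeof(h_column_indices), cudaMemcpyHostToDevice);
        cudaMemcpy(d_row_offsets, h_row_offsets, sizeof(h_row_offsets), cudaMemcpyHostToDevice);
        cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);

        // Size the temporary storage, allocate it, then run the SpMV.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
            d_row_offsets, d_column_indices, d_x, d_y,
            num_rows, num_cols, num_nonzeros);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
            d_row_offsets, d_column_indices, d_x, d_y,
            num_rows, num_cols, num_nonzeros);

        float h_y[num_rows];
        cudaMemcpy(h_y, d_y, sizeof(h_y), cudaMemcpyDeviceToHost);
        for (int i = 0; i < num_rows; i++) printf("y[%d] = %g\n", i, h_y[i]);  // 2 3 2 3 4 3 2 3 2

        cudaFree(d_values); cudaFree(d_column_indices); cudaFree(d_row_offsets);
        cudaFree(d_x); cudaFree(d_y); cudaFree(d_temp_storage);
        return 0;
    }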
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../../agent/agent_histogram.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Histogram kernel entry points + *****************************************************************************/ + +/** + * Histogram initialization kernel entry point + */ +template < + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename OffsetT> ///< Signed integer type for global offsets +__global__ void DeviceHistogramInitKernel( + ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel + ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + if ((threadIdx.x == 0) && (blockIdx.x == 0)) + tile_queue.ResetDrain(); + + int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + if (output_bin < num_output_bins_wrapper.array[CHANNEL]) + d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; + } +} + + +/** + * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< The input iterator type. \iterator. 
+ typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +__global__ void DeviceHistogramSweepKernel( + SampleIteratorT d_samples, ///< Input data to reduce + ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram + ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram + ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms + ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms + ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for compositing input tiles + typedef AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT> + AgentHistogramT; + + // Shared memory for AgentHistogram + __shared__ typename AgentHistogramT::TempStorage temp_storage; + + AgentHistogramT agent( + temp_storage, + d_samples, + num_output_bins_wrapper.array, + num_privatized_bins_wrapper.array, + d_output_histograms_wrapper.array, + d_privatized_histograms_wrapper.array, + output_decode_op_wrapper.array, + privatized_decode_op_wrapper.array); + + // Initialize counters + agent.InitBinCounters(); + + // Consume input tiles + agent.ConsumeTiles( + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Store output to global (if necessary) + agent.StoreOutput(); + +} + + + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename LevelT, ///< Type for specifying bin level boundaries + typename OffsetT> ///< Signed integer type for global offsets +struct DipatchHistogram +{ + 
//--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + enum + { + // Maximum number of bins per channel for which we will use a privatized smem strategy + MAX_PRIVATIZED_SMEM_BINS = 256 + }; + + + //--------------------------------------------------------------------- + // Transform functors for converting samples to bin-ids + //--------------------------------------------------------------------- + + // Searches for bin given a list of bin-boundary levels + template + struct SearchTransform + { + LevelIteratorT d_levels; // Pointer to levels array + int num_output_levels; // Number of levels in array + + // Initializer + __host__ __device__ __forceinline__ void Init( + LevelIteratorT d_levels, // Pointer to levels array + int num_output_levels) // Number of levels in array + { + this->d_levels = d_levels; + this->num_output_levels = num_output_levels; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + /// Level iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + LevelIteratorT>::Type // Directly use the supplied input iterator type + WrappedLevelIteratorT; + + WrappedLevelIteratorT wrapped_levels(d_levels); + + int num_bins = num_output_levels - 1; + if (valid) + { + bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; + if (bin >= num_bins) + bin = -1; + } + } + }; + + + // Scales samples to evenly-spaced bins + struct ScaleTransform + { + int num_bins; // Number of levels in array + LevelT max; // Max sample level (exclusive) + LevelT min; // Min sample level (inclusive) + LevelT scale; // Bin scaling factor + + // Initializer + template + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + _LevelT max, // Max sample level (exclusive) + _LevelT min, // Min sample level (inclusive) + _LevelT scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = scale; + } + + // Initializer (float specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + float max, // Max sample level (exclusive) + float min, // Min sample level (inclusive) + float scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = float(1.0) / scale; + } + + // Initializer (double specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + double max, // Max sample level (exclusive) + double min, // Min sample level (inclusive) + double scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = double(1.0) / scale; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) / scale); + } + + // Method for converting samples to bin-ids (float specialization) + 
template + __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + + // Method for converting samples to bin-ids (double specialization) + template + __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + }; + + + // Pass-through bin transform operator + struct PassThruTransform + { + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + if (valid) + bin = (int) sample; + } + }; + + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + template + struct TScale + { + enum + { + V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), + VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) + }; + }; + + + /// SM11 + struct Policy110 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + (NUM_CHANNELS == 1) ? 256 : 128, + (NUM_CHANNELS == 1) ? 8 : 3, + (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 
8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM35 + struct Policy350 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 128, + TScale<8>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLEND, + true> + HistogramSweepPolicy; + }; + + /// SM50 + struct Policy500 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 384, + TScale<16>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + KernelConfig &histogram_sweep_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + return histogram_sweep_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + return histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + return cudaErrorNotSupported; + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int pixels_per_thread; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; + + return cudaSuccess; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Privatization-based dispatch routine + */ + template < + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel + typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel + CUB_RUNTIME_FUNCTION 
__forceinline__ + static cudaError_t PrivatizedDispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int max_num_output_bins, ///< [in] Maximum number of output bins in any channel + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel + DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel + KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histogram_sweep_kernel + int histogram_sweep_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histogram_sweep_sm_occupancy, + histogram_sweep_kernel, + histogram_sweep_config.block_threads))) break; + + // Get device occupancy for histogram_sweep_kernel + int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; + + if (num_row_pixels * NUM_CHANNELS == row_stride_samples) + { + // Treat as a single linear array of samples + num_row_pixels *= num_rows; + num_rows = 1; + row_stride_samples = num_row_pixels * NUM_CHANNELS; + } + + // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy + int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; + int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); + int blocks_per_col = (blocks_per_row > 0) ? + int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : + 0; + int num_thread_blocks = blocks_per_row * blocks_per_col; + + dim3 sweep_grid_dims; + sweep_grid_dims.x = (unsigned int) blocks_per_row; + sweep_grid_dims.y = (unsigned int) blocks_per_col; + sweep_grid_dims.z = 1; + + // Temporary storage allocation requirements + const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; + void* allocations[NUM_ALLOCATIONS]; + size_t allocation_sizes[NUM_ALLOCATIONS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); + + allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the grid queue descriptor + GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_output_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; + + // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_privatized_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; + + // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper privatized_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; + + // Setup 
array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper output_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; + + // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_privatized_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; + + // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_output_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; + + int histogram_init_block_threads = 256; + int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; + + // Log DeviceHistogramInitKernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", + histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); + + // Invoke histogram_init_kernel + histogram_init_kernel<<>>( + num_output_bins_wrapper, + d_output_histograms_wrapper, + tile_queue); + + // Return if empty problem + if ((blocks_per_row == 0) || (blocks_per_col == 0)) + break; + + // Log histogram_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", + sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, + histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); + + // Invoke histogram_sweep_kernel + histogram_sweep_kernel<<>>( + d_samples, + num_output_bins_wrapper, + num_privatized_bins_wrapper, + d_output_histograms_wrapper, + d_privatized_histograms_wrapper, + output_decode_op_wrapper, + privatized_decode_op_wrapper, + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. 
+ int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the search transform op for converting samples to privatized bins + typedef SearchTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + // Dispatch + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Too many bins to keep in shared memory. 
+ const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the search transform op for converting privatized bins to output bins + typedef SearchTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; // Maximum number of levels in any channel + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
+ OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the scale transform op for converting samples to privatized bins + typedef ScaleTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + + privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + } + while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the scale transform op for converting privatized bins to output bins + typedef ScaleTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + 
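    /*
     * A minimal usage sketch (illustrative, not part of this patch): these
     * DispatchEven routines are normally reached through the public
     * cub::DeviceHistogram::HistogramEven entry point, using the two-phase
     * convention implemented here (a first call with d_temp_storage == NULL
     * only reports temp_storage_bytes; the second call launches the init and
     * sweep kernels). The buffers below are assumed to be device allocations
     * populated elsewhere.
     *
     * \code
     * unsigned char *d_samples;    // num_samples 8-bit samples (assumed allocated)
     * int           *d_histogram;  // 256 counters              (assumed allocated)
     * int num_samples = 1 << 20;
     * int num_levels  = 257;       // bins = num_levels - 1 = 256
     * int lower_level = 0, upper_level = 256;
     *
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
     * \endcode
     *
     * Because the samples are 8-bit, this takes the 256-bin privatized path
     * (PRIVATIZED_SMEM_BINS = 256) and then reduces to the requested output bins.
     */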
} + while (0); + + return error; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_radix_sort.cuh new file mode 100644 index 0000000000..d1a992d438 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_radix_sort.cuh @@ -0,0 +1,1619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_radix_sort_upsweep.cuh" +#include "../../agent/agent_radix_sort_downsweep.cuh" +#include "../../agent/agent_scan.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortUpsweepKernel( + const KeyT *d_keys, ///< [in] Input keys buffer + OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + OffsetT /*num_items*/, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortUpsweep type for the current configuration + typedef AgentRadixSortUpsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, + typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type, + KeyT, + OffsetT> + AgentRadixSortUpsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; + + // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block + even_share.template BlockInit(); + + AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits); + + upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); + + CTA_SYNC(); + + // Write out digit counts (striped) + upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); +} + + +/** + * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortScanBinsKernel( + OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + int num_counts) ///< [in] Total number of bin-counts +{ + // Parameterize the AgentScan type for the current configuration + typedef AgentScan< + typename ChainedPolicyT::ActivePolicy::ScanPolicy, + OffsetT*, + OffsetT*, + cub::Sum, + OffsetT, + OffsetT> + AgentScanT; + + // Shared memory storage + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Block scan instance + AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ; + + // Process full input tiles + int block_offset = 0; + BlockScanRunningPrefixOp prefix_op(0, Sum()); + while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) + { + block_scan.template ConsumeTile(block_offset, prefix_op); + block_offset += AgentScanT::TILE_ITEMS; + } +} + + +/** + * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortDownsweepKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortDownsweep type for the current configuration + typedef AgentRadixSortDownsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, + typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type, + IS_DESCENDING, + KeyT, + ValueT, + OffsetT> + AgentRadixSortDownsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.template BlockInit(); + + // Process input tiles + AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end); +} + + +/** + * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceRadixSortSingleTileKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison +{ + // Constants + enum + { + BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, + KEYS_ONLY = Equals::VALUE, + }; + + // BlockRadixSort type + typedef BlockRadixSort< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ValueT, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, + (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> + BlockRadixSortT; + + // BlockLoad type (keys) + typedef BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; + + // Unsigned word for key bits + typedef typename Traits::UnsignedBits UnsignedBitsT; + + // Shared memory storage + __shared__ union TempStorage + { + typename BlockRadixSortT::TempStorage sort; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + + } temp_storage; + + // Keys and values for the block + KeyT keys[ITEMS_PER_THREAD]; + ValueT values[ITEMS_PER_THREAD]; + + // Get default (min/max) value for out-of-bounds keys + UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; + KeyT default_key = reinterpret_cast(default_key_bits); + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); + + CTA_SYNC(); + + // Load values + if (!KEYS_ONLY) + { + // Register pressure work-around: moving num_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + num_items = ShuffleIndex(num_items, 0, 0xffffffff); + + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + + CTA_SYNC(); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + +/** + * Segmented radix sorting pass (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedRadixSortKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + int current_bit, ///< [in] Bit position of current radix digit + int pass_bits) ///< [in] Number of bits of current radix digit +{ + // + // Constants + // + + typedef typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, + typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; + + enum + { + BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, + RADIX_BITS = SegmentedPolicyT::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Upsweep type + typedef AgentRadixSortUpsweep< + AgentRadixSortUpsweepPolicy, + KeyT, + OffsetT> + BlockUpsweepT; + + // Digit-scan type + typedef BlockScan DigitScanT; + + // Downsweep type + typedef AgentRadixSortDownsweep BlockDownsweepT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD + }; + + // + // Process input tiles + // + + // Shared memory storage + __shared__ union + { + typename BlockUpsweepT::TempStorage upsweep; + typename BlockDownsweepT::TempStorage downsweep; + struct + { + volatile OffsetT reverse_counts_in[RADIX_DIGITS]; + volatile OffsetT reverse_counts_out[RADIX_DIGITS]; + typename DigitScanT::TempStorage scan; + }; + + } temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + OffsetT num_items = segment_end - segment_begin; + + // Check if empty segment + if (num_items <= 0) + return; + + // Upsweep + BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); + upsweep.ProcessRegion(segment_begin, segment_end); + + CTA_SYNC(); + + // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) + OffsetT bin_count[BINS_TRACKED_PER_THREAD]; + upsweep.ExtractCounts(bin_count); + + CTA_SYNC(); + + if (IS_DESCENDING) + { + // Reverse bin counts + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; + } + } + + // Scan + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) + DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + bin_offset[track] += segment_begin; + } + + if (IS_DESCENDING) + { + // Reverse bin offsets + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; + } + } + + CTA_SYNC(); + + // Downsweep + BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); + downsweep.ProcessRegion(segment_begin, segment_end); +} + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +/** + * Tuning policy for kernel specialization + */ +template < + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + // Dominant-sized key/value type + typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT; + + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep 
policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + }; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef DownsweepPolicyKeys DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + + + }; + + + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM60 (GP100) + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + + }; + + + /// SM61 (GP104) + struct Policy610 : ChainedPolicy<610, Policy610, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM62 (Tegra, less RF) + struct Policy620 : ChainedPolicy<620, Policy620, Policy610> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM70 (GV100) + struct Policy700 : ChainedPolicy<700, Policy700, Policy620> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 8.7B 32b segmented keys/s (GV100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// MaxPolicy + typedef Policy700 MaxPolicy; + + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
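    /*
     * The DoubleBuffer problem-state members above mirror the public API. A
     * minimal usage sketch (illustrative, not part of this patch); d_key_buf,
     * d_key_alt_buf, d_val_buf and d_val_alt_buf are assumed device allocations
     * of num_items elements, with the input in the "current" buffers:
     *
     * \code
     * cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);
     * cub::DoubleBuffer<int>          d_values(d_val_buf, d_val_alt_buf);
     *
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
     *     d_keys, d_values, num_items);                 // size query only
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
     *     d_keys, d_values, num_items);                 // run the sort
     * // The sorted output is in d_keys.Current() / d_values.Current(); the
     * // selectors may have flipped, depending on the number of digit passes.
     * \endcode
     *
     * The DoubleBuffer overloads correspond to is_overwrite_okay == true, so the
     * passes can ping-pong between the two user buffers with no third buffer.
     */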
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + begin_bit(begin_bit), + end_bit(end_bit), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version), + is_overwrite_okay(is_overwrite_okay) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block to sort in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Log single_tile_kernel configuration + if (debug_synchronous) + _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_keys.Current(), + d_keys.Alternate(), + d_values.Current(), + d_values.Alternate(), + num_items, + begin_bit, + end_bit); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update selector + d_keys.selector ^= 1; + d_values.selector ^= 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation + //------------------------------------------------------------------------------ + + /** + * Invoke a three-kernel sorting pass at the current bit. 
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + int spine_length, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log upsweep_kernel configuration + if (debug_synchronous) + _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, + pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + pass_config.upsweep_kernel<<>>( + d_keys_in, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); + + // Invoke scan_kernel + pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + d_spine, + spine_length); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log downsweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, + pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + pass_config.downsweep_kernel<<>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + + /// Pass configuration structure + template < + typename UpsweepKernelT, + typename ScanKernelT, + typename DownsweepKernelT> + struct PassConfig + { + UpsweepKernelT upsweep_kernel; + KernelConfig upsweep_config; + ScanKernelT scan_kernel; + KernelConfig scan_config; + DownsweepKernelT downsweep_kernel; + KernelConfig downsweep_config; + int radix_bits; + int radix_digits; + int max_downsweep_grid_size; + GridEvenShare even_share; + + /// Initialize pass configuration + template < + typename UpsweepPolicyT, + typename ScanPolicyT, + typename DownsweepPolicyT> + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig( + UpsweepKernelT upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + int ptx_version, + int sm_count, + int num_items) + { + cudaError error = cudaSuccess; + do + { + 
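    /*
     * Digit-width planning in a nutshell (used by InvokePasses below): when the
     * number of key bits is not a multiple of the preferred digit width, the
     * leading passes run the narrower alternate configuration so that the total
     * comes out exact. A plain-arithmetic sketch for full 32-bit keys with a
     * 7-bit primary and a 6-bit alternate digit, roughly matching the SM50+
     * policies above (values are illustrative only):
     *
     * \code
     * int begin_bit  = 0,  end_bit        = 32;   // sort all 32 key bits
     * int radix_bits = 7,  alt_radix_bits = 6;    // primary / alternate digit widths
     *
     * int num_bits       = end_bit - begin_bit;                           // 32
     * int num_passes     = (num_bits + radix_bits - 1) / radix_bits;      // 5
     * int max_alt_passes = (num_passes * radix_bits) - num_bits;          // 3
     * int alt_end_bit    = begin_bit + (max_alt_passes * alt_radix_bits); // 18
     *
     * // Passes that start below bit 18 use the 6-bit configuration:
     * // 3 passes x 6 bits + 2 passes x 7 bits = 32 bits in 5 passes total.
     * \endcode
     */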
this->upsweep_kernel = upsweep_kernel; + this->scan_kernel = scan_kernel; + this->downsweep_kernel = downsweep_kernel; + radix_bits = DownsweepPolicyT::RADIX_BITS; + radix_digits = 1 << radix_bits; + + if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; + if (CubDebug(error = scan_config.Init(scan_kernel))) break; + if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + + even_share.DispatchInit( + num_items, + max_downsweep_grid_size, + CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + + } + while (0); + return error; + } + + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel + typename ScanKernelT, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)upsweep_kernel; + (void)alt_upsweep_kernel; + (void)scan_kernel; + (void)downsweep_kernel; + (void)alt_downsweep_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular and alternate-digit kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>( + upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; + + if ((error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>( + alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + + // Get maximum spine length + int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); + int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[3]; + size_t allocation_sizes[3] = + { + spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms + (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
+ (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer
+ };
+
+ // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ return cudaSuccess;
+
+ // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+ int num_bits = end_bit - begin_bit;
+ int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+ bool is_num_passes_odd = num_passes & 1;
+ int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
+ int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+
+ // Alias the temporary storage allocations
+ OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
+
+ DoubleBuffer<KeyT> d_keys_remaining_passes(
+ (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
+ (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
+
+ DoubleBuffer<ValueT> d_values_remaining_passes(
+ (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
+ (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
+
+ // Run first pass, consuming from the input's current buffers
+ int current_bit = begin_bit;
+ if (CubDebug(error = InvokePass(
+ d_keys.Current(), d_keys_remaining_passes.Current(),
+ d_values.Current(), d_values_remaining_passes.Current(),
+ d_spine, spine_length, current_bit,
+ (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+ // Run remaining passes
+ while (current_bit < end_bit)
+ {
+ if (CubDebug(error = InvokePass(
+ d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+ d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+ d_spine, spine_length, current_bit,
+ (current_bit < alt_end_bit) ?
alt_pass_config : pass_config))) break;; + + // Invert selectors + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceRadixSortSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items, ///< [in] Number of items to sort + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
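    /*
     * This Dispatch() routine is what the public cub::DeviceRadixSort entry
     * points funnel into. The pointer-based overloads wrap the user arrays in a
     * DoubleBuffer with is_overwrite_okay == false, which is why InvokePasses
     * above reserves a third keys (and values) buffer in temporary storage. A
     * minimal sketch (illustrative, not part of this patch); d_keys_in and
     * d_keys_out are assumed device arrays of num_items keys:
     *
     * \code
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, num_items);            // size query only
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, num_items);            // run the sort
     * \endcode
     */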
+ { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSegmentedRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Parameter members + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
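    /*
     * Segment boundaries are given as begin/end offset sequences (see the
     * d_begin_offsets / d_end_offsets members above); a single offsets array of
     * length num_segments + 1 can serve as both. A minimal usage sketch of the
     * public entry point (illustrative, not part of this patch); d_keys_in,
     * d_keys_out and d_offsets are assumed device allocations:
     *
     * \code
     * // Three segments covering items [0,100), [100,250), [250,400):
     * // d_offsets = {0, 100, 250, 400}, num_items = 400, num_segments = 3
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, 400, 3, d_offsets, d_offsets + 1);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, 400, 3, d_offsets, d_offsets + 1);
     * \endcode
     *
     * Each segment is sorted independently, one thread block per segment per
     * digit pass, as in DeviceSegmentedRadixSortKernel above.
     */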
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructors + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + begin_bit(begin_bit), + end_bit(end_bit), + is_overwrite_okay(is_overwrite_okay), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Multi-segment invocation + //------------------------------------------------------------------------------ + + /// Invoke a three-kernel sorting pass at the current bit. + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log kernel configuration + if (debug_synchronous) + _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + num_segments, pass_config.segmented_config.block_threads, (long long) stream, + pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); + + pass_config.segmented_kernel<<>>( + d_keys_in, d_keys_out, + d_values_in, d_values_out, + d_begin_offsets, d_end_offsets, num_segments, + current_bit, pass_bits); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + /// PassConfig data structure + template + struct PassConfig + { + SegmentedKernelT segmented_kernel; + KernelConfig segmented_config; + int radix_bits; + int radix_digits; + + /// Initialize pass configuration + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) + { + this->segmented_kernel = segmented_kernel; + this->radix_bits = SegmentedPolicyT::RADIX_BITS; + this->radix_digits = 1 << radix_bits; + + return CubDebug(segmented_config.Init(segmented_kernel)); + } + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization 
of cub::DeviceSegmentedRadixSortKernel + SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_kernel; + (void)alt_segmented_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Init regular and alternate kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; + if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + if (temp_storage_bytes == 0) + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; + int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + radix_bits - 1) / radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + + // Invert selectors and update current bit + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + + /// Internal dispatch routine + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
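The segmented dispatcher above is driven the same way, with per-segment [begin, end) offsets describing independent sub-ranges of the key/value arrays. A hedged sketch through the public cub::DeviceSegmentedRadixSort::SortKeys wrapper (again assuming the wrapper header accompanies this vendored copy; the offset arrays and key type shown are illustrative):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Sort keys independently within each segment; segment i covers
    // [d_begin_offsets[i], d_end_offsets[i]), and empty segments are allowed.
    cudaError_t segmented_sort_example(const float *d_keys_in, float *d_keys_out,
                                       int num_items, int num_segments,
                                       const int *d_begin_offsets, const int *d_end_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query
        cudaError_t err = cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
            num_items, num_segments, d_begin_offsets, d_end_offsets);
        if (err != cudaSuccess) return err;

        // Allocate and sort
        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
        if (err != cudaSuccess) return err;
        err = cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
            num_items, num_segments, d_begin_offsets, d_end_offsets);

        cudaFree(d_temp_storage);
        return err;
    }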
+ { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, num_segments, d_begin_offsets, d_end_offsets, + begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce.cuh new file mode 100644 index 0000000000..e9d1b7ac17 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce.cuh @@ -0,0 +1,882 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "../../agent/agent_reduce.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + ReductionOpT reduction_op) ///< [in] Binary reduction functor +{ + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + d_out[blockIdx.x] = block_aggregate; +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + + +/// Normalize input iterator to segment offset +template +__device__ __forceinline__ +void NormalizeReductionOutput( + T &/*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) +{} + + +/// Normalize input iterator to segment offset (specialized for arg-index) +template +__device__ __forceinline__ +void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) +{ + val.key -= base_offset; +} + + +/** + * Segmented reduction (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) + d_out[blockIdx.x] = init; + return; + } + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + segment_begin, + segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + + if (threadIdx.x == 0) + d_out[blockIdx.x] = reduction_op(init, block_aggregate);; +} + + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct 
Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + // Data type of output iterator + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_items(num_items), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block block to reduce in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke single_reduce_sweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation (two-pass) + 
//------------------------------------------------------------------------------ + + /// Invoke two-passes to reduce + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void) reduce_kernel; + (void) single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug(error = reduce_config.Init(reduce_kernel))) break; + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + OutputT *d_block_reductions = (OutputT*) allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, + reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + reduce_kernel<<>>( + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_block_reductions, + d_out, + 
reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
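The reduction dispatcher documented above follows the same two-phase protocol; inputs larger than a single tile take the two-pass path shown here (a privatized per-block reduction followed by a single-tile pass over the block partials). A minimal sketch through the public cub::DeviceReduce::Sum wrapper (wrapper header assumed to be available; buffer names illustrative):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Sum 'n' doubles on the device; *d_out receives the aggregate.
    cudaError_t device_sum_example(const double *d_in, double *d_out, int n)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query: no work is done while d_temp_storage is NULL.
        cudaError_t err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                                                 d_in, d_out, n);
        if (err != cudaSuccess) return err;

        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
        if (err != cudaSuccess) return err;

        // Reduction proper: small inputs take the single-tile kernel,
        // larger ones the two-pass (DeviceReduceKernel + single-tile) path above.
        err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);

        cudaFree(d_temp_storage);
        return err;
    }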
+ { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchSegmentedReduce : + DeviceReducePolicy< + typename std::iterator_traits::value_type, + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, + segmented_reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + segmented_reduce_kernel<<>>( + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine 
for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) + return cudaSuccess; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, + num_segments, d_begin_offsets, d_end_offsets, + reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce_by_key.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce_by_key.cuh new file mode 100644 index 0000000000..6f4837b7f8 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceReduceByKeyKernel( + KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op, ///< ValueT reduction operator + OffsetT num_items) ///< Total number of items to select from +{ + // Thread block type for reducing tiles of value segments + typedef AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + 
ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT> + AgentReduceByKeyT; + + // Shared memory for AgentReduceByKey + __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; + + // Process tiles + AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchReduceByKey +{ + //------------------------------------------------------------------------- + // Types and constants + //------------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), + COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + //------------------------------------------------------------------------- + // Tuning policies + //------------------------------------------------------------------------- + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM11 + struct Policy110 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); 
+ } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduce-by-key using the + * specified kernel functions. + */ + template < + typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel + typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel + KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_unique_out; + (void)d_values_in; + (void)d_aggregates_out; + (void)d_num_runs_out; + (void)equality_op; + (void)reduction_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)reduce_by_key_kernel; + (void)reduce_by_key_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for reduce_by_key_kernel + int reduce_by_key_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_sm_occupancy, // out + reduce_by_key_kernel, + reduce_by_key_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log reduce_by_key_kernel configuration + if (debug_synchronous) 
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); + + // Invoke reduce_by_key_kernel + reduce_by_key_kernel<<>>( + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_config; + InitConfigs(ptx_version, reduce_by_key_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceReduceByKeyKernel, + reduce_by_key_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_rle.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_rle.cuh new file mode 100644 index 0000000000..98c3681f0a --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_rle.cuh @@ -0,0 +1,538 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_rle.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +__global__ void DeviceRleSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + 
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentRle< + AgentRlePolicyT, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + EqualityOpT, + OffsetT> AgentRleT; + + // Shared memory for AgentRle + __shared__ typename AgentRleT::TempStorage temp_storage; + + // Process tiles + AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_runs_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRle + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRleDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The input value type + typedef typename std::iterator_traits::value_type T; + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig& device_rle_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that 
match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool store_warp_time_slicing; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = AgentRlePolicyT::BLOCK_THREADS; + items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; + load_policy = AgentRlePolicyT::LOAD_ALGORITHM; + store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; + scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_warp_time_slicing, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide run-length-encode using the + * specified kernel functions. + */ + template < + typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel + typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version, ///< [in] PTX version of dispatch kernels + DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel + KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log device_scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors + device_scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for device_rle_sweep_kernel + int device_rle_kernel_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + device_rle_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log device_rle_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, 
device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); + + // Invoke device_rle_sweep_kernel + device_rle_sweep_kernel<<>>( + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig device_rle_config; + InitConfigs(ptx_version, device_rle_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceRleSweepKernel, + device_rle_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_scan.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_scan.cuh new file mode 100644 index 0000000000..3ef720a446 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_scan.cuh @@ -0,0 +1,563 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
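
DeviceRleDispatch follows the same NULL-query-then-allocate convention, producing the offset and length of every run of equal items. In upstream CUB 1.8 the public wrapper that drives it is cub::DeviceRunLengthEncode::NonTrivialRuns; the wrapper name and include path below are assumptions, not part of this diff:

    // Sketch only, assuming the vendored headers mirror the upstream CUB 1.8 API.
    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Report offset/length of every run longer than one item in d_in[0..num_items).
    // All pointers are device pointers owned by the caller.
    cudaError_t nontrivial_runs(const int *d_in, int *d_offsets_out, int *d_lengths_out,
                                int *d_num_runs_out, int num_items, cudaStream_t stream)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Pass 1: NULL temp storage -> only temp_storage_bytes is written.
        cudaError_t err = cub::DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage, temp_storage_bytes,
            d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream);
        if (err != cudaSuccess) return err;

        // Pass 2: allocate, then the dispatcher launches DeviceCompactInitKernel
        // followed by DeviceRleSweepKernel on the given stream.
        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
        if (err != cudaSuccess) return err;
        err = cub::DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage, temp_storage_bytes,
            d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream);
        cudaFree(d_temp_storage);
        return err;
    }
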
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_arch.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanTileStateT, ///< Tile status interface type 
+ typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanKernel( + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScan< + ScanPolicyT, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT> AgentScanT; + + // Shared memory for AgentScan + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Process tiles + AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScan +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 15, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(256, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(96, 21, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(64, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + 
(void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + scan_kernel_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, 
scan_sm_occupancy); + + // Invoke scan_kernel + scan_kernel<<>>( + d_in, + d_out, + tile_state, + start_tile, + scan_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanInitKernel, + DeviceScanKernel, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_select_if.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_select_if.cuh new file mode 100644 index 0000000000..60b331338d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_select_if.cuh @@ -0,0 +1,542 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
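
DispatchScan serves both flavors of scan: the init_value parameter seen in DeviceScanKernel carries a real seed for exclusive scans and cub::NullType for inclusive ones. The sketch below drives it through what upstream CUB 1.8 exposes as cub::DeviceScan::ExclusiveSum (an assumed wrapper, not shown in this diff), with debug_synchronous set to true so the _CubLog launch reports above are printed and each launch is synchronized:

    // Sketch only: cub::DeviceScan::ExclusiveSum and the include path are
    // assumptions based on the upstream CUB 1.8 API.
    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Exclusive prefix sum of d_in into d_out (device pointers, num_items each).
    cudaError_t exclusive_sum(const float *d_in, float *d_out, int num_items)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Pass 1: size query only (d_temp_storage == NULL, no work is done).
        cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                      d_in, d_out, num_items);

        // Pass 2: run the scan; debug_synchronous = true makes the dispatcher
        // print the init_kernel / scan_kernel launch configurations and sync
        // the stream after every launch to surface runtime errors.
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cudaError_t err = cub::DeviceScan::ExclusiveSum(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, (cudaStream_t) 0, true);
        cudaFree(d_temp_storage);
        return err;
    }
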
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_select_if.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +__global__ void DeviceSelectSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + SelectOpT 
select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + SelectedOutputIteratorT, + SelectOpT, + EqualityOpT, + OffsetT, + KEEP_REJECTS> AgentSelectIfT; + + // Shared memory for AgentSelectIf + __shared__ typename AgentSelectIfT::TempStorage temp_storage; + + // Process tiles + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_selected_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DispatchSelectIf +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 10, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectIfPolicyT; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + SelectIfPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_if_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template 
Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide selection using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel + KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_flags; + (void)d_selected_out; + (void)d_num_selected_out; + (void)select_op; + (void)equality_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)scan_init_kernel; + (void)select_if_kernel; + (void)select_if_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke scan_init_kernel to initialize tile descriptors + scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for select_if_kernel + int range_select_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + range_select_sm_occupancy, // out + select_if_kernel, + select_if_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) _CubLog("Invoking 
select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); + + // Invoke select_if_kernel + select_if_kernel<<>>( + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + tile_status, + select_op, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_if_config; + InitConfigs(ptx_version, select_if_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceSelectSweepKernel, + select_if_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_spmv_orig.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_spmv_orig.cuh new file mode 100644 index 0000000000..ab9c5346d2 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_spmv_orig.cuh @@ -0,0 +1,834 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include + +#include "../../agent/single_pass_scan_operators.cuh" +#include "../../agent/agent_segment_fixup.cuh" +#include "../../agent/agent_spmv_orig.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * SpMV kernel entry points + *****************************************************************************/ + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +__global__ void DeviceSpmv1ColKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + + ValueT value = 0.0; + if (end_nonzero_idx != nonzero_idx) + { + value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; + } + + spmv_params.d_vector_y[row_idx] = value; + } +} + + +/** + * Spmv search kernel. 
Identifies merge path starting coordinates for each tile. + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SpmvParamsT> ///< SpmvParams type +__global__ void DeviceSpmvSearchKernel( + int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) + CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates + SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +{ + /// Constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + typedef CacheModifiedInputIterator< + SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + // Find the starting coordinate for all tiles (plus the end coordinate of the last one) + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_merge_tiles + 1) + { + OffsetT diagonal = (tile_idx * TILE_ITEMS); + CoordinateT tile_coordinate; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coordinate); + + // Output starting offset + d_tile_coordinates[tile_idx] = tile_coordinate; + } +} + + +/** + * Spmv agent entry point + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ScanTileStateT, ///< Tile status interface type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 + bool HAS_BETA> ///< Whether the input parameter Beta is 0 +__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) +__global__ void DeviceSpmvKernel( + SpmvParams spmv_params, ///< [in] SpMV input parameter bundle + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_tiles, ///< [in] Number of merge tiles + ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel + int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +{ + // Spmv agent type specialization + typedef AgentSpmv< + SpmvPolicyT, + ValueT, + OffsetT, + HAS_ALPHA, + HAS_BETA> + AgentSpmvT; + + // Shared memory for AgentSpmv + __shared__ typename AgentSpmvT::TempStorage temp_storage; + + AgentSpmvT(temp_storage, spmv_params).ConsumeTile( + d_tile_coordinates, + d_tile_carry_pairs, + num_tiles); + + // Initialize fixup tile status + tile_state.InitializeStatus(num_segment_fixup_tiles); + +} + + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename OffsetT, ///< Signed integer type for global offsets + typename ScanTileStateT> ///< Tile status interface type 
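// -----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] The search kernel
// above load-balances SpMV by splitting the conceptual "merge" of the
// row_end_offsets array with the counting sequence 0..num_nonzeros-1 into
// equal-sized tiles. A plain CPU version of that diagonal binary search
// (function and variable names invented for illustration) looks like this:
//
//   #include <algorithm>
//
//   // Find the (row, nonzero) coordinate at which the given merge-path
//   // diagonal crosses the path defined by row_end_offsets and the counting
//   // sequence of nonzero indices.
//   void merge_path_search(long diagonal, const long *row_end_offsets,
//                          long num_rows, long num_nonzeros,
//                          long &row, long &nonzero)
//   {
//       long lo = std::max(diagonal - num_nonzeros, 0L);
//       long hi = std::min(diagonal, num_rows);
//       while (lo < hi)
//       {
//           long mid = (lo + hi) / 2;
//           // The counting sequence satisfies b[i] == i, so b[diagonal-mid-1]
//           // is simply diagonal - mid - 1.
//           if (row_end_offsets[mid] <= diagonal - mid - 1)
//               lo = mid + 1;
//           else
//               hi = mid;
//       }
//       row     = lo;             // rows consumed along this diagonal
//       nonzero = diagonal - lo;  // nonzeros consumed along this diagonal
//   }
// -----------------------------------------------------------------------------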
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +__global__ void DeviceSegmentFixupKernel( + PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block + AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates + OffsetT num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + ScanTileStateT tile_state) ///< [in] Tile status interface +{ + // Thread block type for reducing tiles of value segments + typedef AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + cub::Equality, + cub::Sum, + OffsetT> + AgentSegmentFixupT; + + // Shared memory for AgentSegmentFixup + __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; + + // Process tiles + AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( + num_items, + num_tiles, + tile_state); +} + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSpmv +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // SpmvParams bundle type + typedef SpmvParams SpmvParamsT; + + // 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM11 + struct Policy110 + { + typedef AgentSpmvPolicy< + 128, + 1, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM20 + struct Policy200 + { + typedef AgentSpmvPolicy< + 96, + 18, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_RAKING> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + + /// SM30 + struct Policy300 + { + typedef AgentSpmvPolicy< + 96, + 6, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + /// SM35 + struct Policy350 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 96 : 128, + (sizeof(ValueT) > 4) ? 4 : 7, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + (sizeof(ValueT) > 4) ? 
true : false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + /// SM37 + struct Policy370 + { + + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 128 : 128, + (sizeof(ValueT) > 4) ? 9 : 14, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM50 + struct Policy500 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 6 : 7, + LOAD_LDG, + LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_RAKING_MEMOIZE> + SegmentFixupPolicyT; + }; + + + /// SM60 + struct Policy600 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 5 : 7, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 370) + typedef Policy370 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; + struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &spmv_config, + KernelConfig &segment_fixup_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + 
segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel + typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel + typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel + typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
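// -----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] A minimal host-side
// analogue of the KernelConfig / InitConfigs pattern used above: the policy
// structs and version cutoffs below are invented stand-ins, but the shape of
// the code mirrors how a tuning policy is turned into launch parameters.
//
//   struct ExamplePolicy350 { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 7  }; };
//   struct ExamplePolicy200 { enum { BLOCK_THREADS =  96, ITEMS_PER_THREAD = 18 }; };
//
//   struct ExampleKernelConfig
//   {
//       int block_threads, items_per_thread, tile_items;
//
//       template <typename PolicyT>
//       void Init()
//       {
//           block_threads    = PolicyT::BLOCK_THREADS;
//           items_per_thread = PolicyT::ITEMS_PER_THREAD;
//           tile_items       = block_threads * items_per_thread;
//       }
//   };
//
//   void InitExampleConfig(int ptx_version, ExampleKernelConfig &config)
//   {
//       if (ptx_version >= 350) config.Init<ExamplePolicy350>();
//       else                    config.Init<ExamplePolicy200>();
//   }
// -----------------------------------------------------------------------------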
+ Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel + SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel + SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel + SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for + KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + if (spmv_params.num_cols == 1) + { + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + temp_storage_bytes = 1; + break; + } + + // Get search/init grid dims + int degen_col_kernel_block_size = INIT_KERNEL_THREADS; + int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; + + if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", + degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_1col_kernel<<>>( + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + break; + } + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Total number of spmv work items + int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; + + // Tile sizes of kernels + int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; + int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; + + // Number of tiles for kernels + unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; + unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; + + // Get SM occupancy for kernels + int spmv_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + spmv_sm_occupancy, + spmv_kernel, + spmv_config.block_threads))) break; + + int segment_fixup_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + segment_fixup_sm_occupancy, + segment_fixup_kernel, + segment_fixup_config.block_threads))) break; + + // Get grid dimensions + dim3 spmv_grid_size( + CUB_MIN(num_merge_tiles, max_dim_x), + (num_merge_tiles + max_dim_x - 1) / max_dim_x, + 1); + + dim3 segment_fixup_grid_size( + CUB_MIN(num_segment_fixup_tiles, max_dim_x), + (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, + 1); + + // Get the temporary storage allocation requirements + size_t allocation_sizes[3]; + if (CubDebug(error = 
ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors + allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs + allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + void* allocations[3]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; + + // Alias the other allocations + KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs + CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates + + // Get search/init grid dims + int search_block_size = INIT_KERNEL_THREADS; + int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; + +#if (CUB_PTX_ARCH == 0) + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; +#endif + + if (search_grid_size < sm_count) +// if (num_merge_tiles < spmv_sm_occupancy * sm_count) + { + // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords + d_tile_coordinates = NULL; + } + else + { + // Use separate search kernel if we have enough spmv tiles to saturate the device + + // Log spmv_search_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", + search_grid_size, search_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_search_kernel<<>>( + num_merge_tiles, + d_tile_coordinates, + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log spmv_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); + + // Invoke spmv_kernel + spmv_kernel<<>>( + spmv_params, + d_tile_coordinates, + d_tile_carry_pairs, + num_merge_tiles, + tile_state, + num_segment_fixup_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Run reduce-by-key fixup if necessary + if (num_merge_tiles > 1) + { + // Log segment_fixup_kernel configuration + if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); + + // Invoke segment_fixup_kernel + 
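// -----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] The temporary
// storage handling a few lines above packs several allocations into one
// caller-provided blob. The helper below illustrates the general pattern
// (names and the 256-byte alignment are assumptions for illustration, not the
// exact CUB AliasTemporaries implementation):
//
//   #include <cstddef>
//
//   // Compute aligned offsets for `count` sub-allocations inside one blob and
//   // report the total size; pointers are only filled in when blob != NULL.
//   void alias_temporaries(void *blob, size_t &blob_bytes,
//                          void *ptrs[], const size_t sizes[], int count)
//   {
//       const size_t ALIGN  = 256;
//       size_t       offset = 0;
//       for (int i = 0; i < count; ++i)
//       {
//           if (blob != NULL)
//               ptrs[i] = static_cast<char *>(blob) + offset;
//           offset += (sizes[i] + ALIGN - 1) / ALIGN * ALIGN;
//       }
//       blob_bytes = offset;  // caller allocates this, then calls a second time
//   }
// -----------------------------------------------------------------------------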
segment_fixup_kernel<<>>( + d_tile_carry_pairs, + spmv_params.d_vector_y, + num_merge_tiles, + num_segment_fixup_tiles, + tile_state); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + +#if (CUB_PTX_ARCH == 0) + // Free textures + if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; +#endif + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig spmv_config, segment_fixup_config; + InitConfigs(ptx_version, spmv_config, segment_fixup_config); + + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_barrier.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_barrier.cuh new file mode 100644 index 0000000000..461fb44216 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with its sync counter + __threadfence(); + CTA_SYNC(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + CTA_SYNC(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. + * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. 
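 *
 * \par
 * [Editorial sketch, not part of the original CUB header.] Typical usage,
 * assuming a kernel that needs a grid-wide software barrier between two
 * phases (the kernel and variable names below are invented):
 *
 * \code
 * __global__ void TwoPhaseKernel(cub::GridBarrier barrier)
 * {
 *     // ... phase 1 ...
 *     barrier.Sync();                 // every resident thread block waits here
 *     // ... phase 2 ...
 * }
 *
 * void Launch(int grid_size, int block_size)
 * {
 *     cub::GridBarrierLifetime barrier;
 *     barrier.Setup(grid_size);       // lazily allocates and zeroes the counters
 *     TwoPhaseKernel<<<grid_size, block_size>>>(barrier);
 * }
 * \endcode
 *
 * Because the barrier is a software construct, every participating thread
 * block must be co-resident on the device for Sync() to make progress, so
 * grid_size must not exceed the device's maximum resident-block count.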
+ */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_even_share.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_even_share.cuh new file mode 100644 index 0000000000..f0b3a69ae0 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_even_share.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. 
Each thread block gets roughly the same number of fixed-size work units (grains). + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "grid_mapping.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among + * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly + * the same number of input tiles. + * + * \par Overview + * Each thread block is assigned a consecutive sequence of input tiles. To help + * preserve alignment and eliminate the overhead of guarded loads for all but the + * last thread block, to GridEvenShare assigns one of three different amounts of + * work to a given thread block: "big", "normal", or "last". The "big" workloads + * are one scheduling grain larger than "normal". The "last" work unit for the + * last thread block may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct an + * instance of GridEvenShare. The instance can be passed to child thread blocks + * which can initialize their per-thread block offsets using \p BlockInit(). + */ +template +struct GridEvenShare +{ +private: + + OffsetT total_tiles; + int big_shares; + OffsetT big_share_items; + OffsetT normal_share_items; + OffsetT normal_base_offset; + +public: + + /// Total number of input items + OffsetT num_items; + + /// Grid size in thread blocks + int grid_size; + + /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles + OffsetT block_offset; + + /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles + OffsetT block_end; + + /// Stride between input tiles + OffsetT block_stride; + + + /** + * \brief Constructor. + */ + __host__ __device__ __forceinline__ GridEvenShare() : + total_tiles(0), + big_shares(0), + big_share_items(0), + normal_share_items(0), + normal_base_offset(0), + num_items(0), + grid_size(0), + block_offset(0), + block_end(0), + block_stride(0) + {} + + + /** + * \brief Dispatch initializer. To be called prior prior to kernel launch. + */ + __host__ __device__ __forceinline__ void DispatchInit( + OffsetT num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int tile_items) ///< Number of data items per input tile + { + this->block_offset = num_items; // Initialize past-the-end + this->block_end = num_items; // Initialize past-the-end + this->num_items = num_items; + this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->grid_size = CUB_MIN(total_tiles, max_grid_size); + OffsetT avg_tiles_per_block = total_tiles / grid_size; + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + this->normal_share_items = avg_tiles_per_block * tile_items; + this->normal_base_offset = big_shares * tile_items; + this->big_share_items = normal_share_items + tile_items; + } + + + /** + * \brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. 
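 *
 * \par
 * [Editorial worked example, not in the original header.] With num_items = 1000
 * and tile_items = 128, DispatchInit above computes total_tiles = 8. If the
 * grid is capped at 3 blocks, then avg_tiles_per_block = 8/3 = 2 and
 * big_shares = 8 - 2*3 = 2: blocks 0 and 1 each rake 3 consecutive tiles
 * (384 items apiece), while block 2 receives the normal 2-tile share covering
 * the remaining 232 items, its second tile only partially full (104 items).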
+ */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = TILE_ITEMS; + if (block_id < big_shares) + { + // This thread block gets a big share of grains (avg_tiles_per_block + 1) + block_offset = (block_id * big_share_items); + block_end = block_offset + big_share_items; + } + else if (block_id < total_tiles) + { + // This thread block gets a normal share of grains (avg_tiles_per_block) + block_offset = normal_base_offset + (block_id * normal_share_items); + block_end = CUB_MIN(num_items, block_offset + normal_share_items); + } + // Else default past-the-end + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = grid_size * TILE_ITEMS; + block_offset = (block_id * TILE_ITEMS); + block_end = num_items; + } + + + /** + * \brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. + */ + template < + int TILE_ITEMS, + GridMappingStrategy STRATEGY> + __device__ __forceinline__ void BlockInit() + { + BlockInit(blockIdx.x, Int2Type()); + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + this->block_offset = block_offset; + this->block_end = block_end; + this->block_stride = TILE_ITEMS; + } + + +}; + + + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_mapping.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_mapping.cuh new file mode 100644 index 0000000000..f0e9fded26 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_mapping.cuh @@ -0,0 +1,113 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An a "raking" access pattern in which each thread block is + * assigned a consecutive sequence of input tiles + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of n/p elements + * in tile-size increments. + */ + GRID_MAPPING_RAKE, + + /** + * \brief An a "strip mining" access pattern in which the input tiles assigned + * to each thread block are separated by a stride equal to the the extent of + * the grid. + * + * \par Overview + * The input is evenly partitioned into \p p sets, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each set is comprised of + * data tiles separated by stride \p tiles, where a tile is a small, + * constant-sized unit of input to be processed to completion before the + * thread block terminates or obtains more work. The kernel invokes \p p + * thread blocks, each of which iteratively consumes a segment of + * n/p elements in tile-size increments. + */ + GRID_MAPPING_STRIP_MINE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. 
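 *
 * \par
 * [Editorial sketch, not in the original header.] For contrast, the first two
 * strategies above differ only in how a block steps through its tiles; in
 * simplified form (tile_size, num_items, first_tile, last_tile and Process
 * are placeholders):
 *
 * \code
 * // GRID_MAPPING_RAKE: each block owns a consecutive run of tiles
 * for (int t = first_tile[blockIdx.x]; t < last_tile[blockIdx.x]; ++t)
 *     Process(t * tile_size);
 *
 * // GRID_MAPPING_STRIP_MINE: blocks interleave tiles, striding by the grid size
 * for (int t = blockIdx.x; t * tile_size < num_items; t += gridDim.x)
 *     Process(t * tile_size);
 * \endcode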
+ */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_queue.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_queue.cuh new file mode 100644 index 0000000000..9615b14dbe --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_queue.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_debug.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridQueue is a descriptor utility for dynamic queue management. + * + * \par Overview + * GridQueue descriptors provides abstractions for "filling" or + * "draining" globally-shared vectors. + * + * \par + * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, + * returning a unique offset for the calling thread to write its items. + * The GridQueue maintains the total "fill-size". The fill counter must be reset + * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that + * will be filling. + * + * \par + * Similarly, a "draining" GridQueue works by works by atomically-incrementing a + * zero-initialized counter, returning a unique offset for the calling thread to + * read its items. Threads can safely drain until the array's logical fill-size is + * exceeded. The drain counter must be reset using GridQueue::ResetDrain or + * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that + * will be filling. 
(For dynamic work distribution of existing data, the corresponding fill-size + * is simply the number of elements in the array.) + * + * \par + * Iterative work management can be implemented simply with a pair of flip-flopping + * work buffers, each with an associated set of fill and drain GridQueue descriptors. + * + * \tparam OffsetT Signed integer type for global offsets + */ +template +class GridQueue +{ +private: + + /// Counter indices + enum + { + FILL = 0, + DRAIN = 1, + }; + + /// Pair of counters + OffsetT *d_counters; + +public: + + /// Returns the device allocation size in bytes needed to construct a GridQueue instance + __host__ __device__ __forceinline__ + static size_t AllocationSize() + { + return sizeof(OffsetT) * 2; + } + + + /// Constructs an invalid GridQueue descriptor + __host__ __device__ __forceinline__ GridQueue() + : + d_counters(NULL) + {} + + + /// Constructs a GridQueue descriptor around the device storage allocation + __host__ __device__ __forceinline__ GridQueue( + void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). + : + d_counters((OffsetT*) d_storage) + {} + + + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( + OffsetT fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); +#endif + } + + + /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// Returns the fill-size established by the parent or by the previous kernel. + __host__ __device__ __forceinline__ cudaError_t FillSize( + OffsetT &fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + fill_size = d_counters[FILL]; + return cudaSuccess; +#else + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); +#endif + } + + + /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Drain(OffsetT num_items) + { + return atomicAdd(d_counters + DRAIN, num_items); + } + + + /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
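// -------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] A typical
// "draining" use of GridQueue: the host sets the fill-size, then every
// thread block repeatedly claims work until the queue is exhausted. The
// kernel and buffer names below are invented for illustration.
//
//   __global__ void DrainKernel(cub::GridQueue<int> queue,
//                               const float *d_in, int num_items)
//   {
//       __shared__ int tile_base;
//       while (true)
//       {
//           if (threadIdx.x == 0)
//               tile_base = queue.Drain(blockDim.x);  // atomically claim a tile
//           __syncthreads();
//           if (tile_base >= num_items)
//               break;                                // logical fill-size exhausted
//           int i = tile_base + threadIdx.x;
//           if (i < num_items)
//           {
//               // ... process d_in[i] ...
//           }
//           __syncthreads();
//       }
//   }
//
//   // Host side, before launching DrainKernel:
//   //   void *d_storage;
//   //   cudaMalloc(&d_storage, cub::GridQueue<int>::AllocationSize());
//   //   cub::GridQueue<int> queue(d_storage);
//   //   queue.FillAndResetDrain(num_items);
// -------------------------------------------------------------------------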
+ __device__ __forceinline__ OffsetT Fill(OffsetT num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template +__global__ void FillAndResetDrainKernel( + GridQueue grid_queue, + OffsetT num_items) +{ + grid_queue.FillAndResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/host/mutex.cuh b/GraphBLAS/CUDA/local_cub/host/mutex.cuh new file mode 100644 index 0000000000..ff7ec90ddc --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/host/mutex.cuh @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Simple portable mutex + */ + + +#pragma once + +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + #include +#else + #if defined(_WIN32) || defined(_WIN64) + #include + + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #undef WIN32_LEAN_AND_MEAN + #undef NOMINMAX + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + + #endif +#endif + +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Simple portable mutex + * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) + * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) + */ +struct Mutex +{ +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + + std::mutex mtx; + + void Lock() + { + mtx.lock(); + } + + void Unlock() + { + mtx.unlock(); + } + + void TryLock() + { + mtx.try_lock(); + } + +#else //__cplusplus > 199711L + + #if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + + #else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { + } + + #endif // defined(_MSC_VER) + + /// Lock member + volatile Spinlock lock; + + /** + * Constructor + */ + Mutex() : lock(0) {} + + /** + * Return when the specified spinlock has been acquired + */ + __forceinline__ void Lock() + { + while (1) + { + if (!_InterlockedExchange(&lock, 1)) return; + while (lock) YieldProcessor(); + } + } + + + /** + * Release the specified spinlock + */ + __forceinline__ void Unlock() + { + _ReadWriteBarrier(); + lock = 0; + } + +#endif // __cplusplus > 199711L + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/iterator/arg_index_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/arg_index_input_iterator.cuh new file mode 100644 index 0000000000..95a84a5797 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/arg_index_input_iterator.cuh @@ -0,0 +1,259 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
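A rough usage sketch of the Mutex above: the calling code looks the same whichever backend (std::mutex or the spinlock) is compiled in. BumpCounter and shared_counter are illustrative names only:

cub::Mutex mutex;                 // shared by the participating host threads
static int shared_counter = 0;    // hypothetical shared host-side state

void BumpCounter()
{
    mutex.Lock();                 // std::mutex::lock() or the spinlock, per the #if above
    ++shared_counter;             // critical section
    mutex.Unlock();
}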
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#include + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * + * \par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose + * \p key field is \p i and whose \p value field is itr[i]. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. 
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * dereference an array of doubles + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::ArgIndexInputIterator itr(d_in); + * + * // Within device code: + * typedef typename cub::ArgIndexInputIterator::value_type Tuple; + * Tuple item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 8.0 @ 0 + * + * itr = itr + 6; + * item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 9.0 @ 6 + * + * \endcode + * + * \tparam InputIteratorT The value type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + */ +template < + typename InputIteratorT, + typename OffsetT = ptrdiff_t, + typename OutputValueT = typename std::iterator_traits::value_type> +class ArgIndexInputIterator +{ +public: + + // Required iterator traits + typedef ArgIndexInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef KeyValuePair value_type; ///< The type of the element the iterator can point to + typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + InputIteratorT itr; + difference_type offset; + +public: + + /// Constructor + __host__ __device__ __forceinline__ ArgIndexInputIterator( + InputIteratorT itr, ///< Input iterator to wrap + difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator + : + itr(itr), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + value_type retval; + retval.value = itr[offset]; + retval.key = offset; + return retval; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(itr, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(itr, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ 
self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((itr == rhs.itr) && (offset == rhs.offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((itr != rhs.itr) || (offset != rhs.offset)); + } + + /// Normalize + __host__ __device__ __forceinline__ void normalize() + { + itr += offset; + offset = 0; + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/cache_modified_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_input_iterator.cuh new file mode 100644 index 0000000000..b4ad91e2f1 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_input_iterator.cuh @@ -0,0 +1,240 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
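Since wrapped host memory may be dereferenced on the host, the key/value pairing produced by ArgIndexInputIterator can be checked without a kernel. A small sketch, assuming the iterator header above is on the include path (h_in and pair are hypothetical names):

#include <cstdio>
// plus the ArgIndexInputIterator header shown above

double h_in[7] = {8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0};

cub::ArgIndexInputIterator<double*> itr(h_in);
cub::ArgIndexInputIterator<double*>::value_type pair = itr[6];   // a KeyValuePair

printf("%f @ %d\n", pair.value, (int) pair.key);                 // 9.0 @ 6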
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * + * \par Overview + * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by reading \p ValueType values through loads modified by \p MODIFIER. + * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", + * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * dereference a device array of double using the "ldg" PTX load modifier + * (i.e., load values through texture cache). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::CacheModifiedInputIterator itr(d_in); + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * \endcode + * + * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedInputIterator +{ +public: + + // Required iterator traits + typedef CacheModifiedInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + +public: + + /// Wrapped native pointer + ValueType* ptr; + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedInputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) 
+ {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __device__ __forceinline__ reference operator*() const + { + return ThreadLoad(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __device__ __forceinline__ reference operator[](Distance n) const + { + return ThreadLoad(ptr + n); + } + + /// Structure dereference + __device__ __forceinline__ pointer operator->() + { + return &ThreadLoad(ptr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/cache_modified_output_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_output_iterator.cuh new file mode 100644 index 0000000000..c3e3321d30 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_output_iterator.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
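A short kernel sketch of the wrapper above, routing read-only loads through the LOAD_LDG modifier; ScaleKernel, d_in, d_out and n are placeholder names:

__global__ void ScaleKernel(const double *d_in, double *d_out, int n)
{
    // Wrap the input pointer so every read is issued as an LDG (texture-path) load
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> in(d_in);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = 2.0 * in[i];     // in[i] -> ThreadLoad<LOAD_LDG>(ptr + i)
}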
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * + * \par Overview + * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by writing \p ValueType values through stores modified by \p MODIFIER. + * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", + * "STORE_CG", "STORE_CS", "STORE_WT", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * dereference a device array of doubles using the "wt" PTX load modifier + * (i.e., write-through to system memory). 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_out; // e.g., [, , , , , , ] + * + * // Create an iterator wrapper + * cub::CacheModifiedOutputIterator itr(d_out); + * + * // Within device code: + * itr[0] = 8.0; + * itr[1] = 66.0; + * itr[55] = 24.0; + * + * \endcode + * + * \par Usage Considerations + * - Can only be dereferenced within device code + * + * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheStoreModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedOutputIterator +{ +private: + + // Proxy object + struct Reference + { + ValueType* ptr; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} + + /// Assignment + __device__ __forceinline__ ValueType operator =(ValueType val) + { + ThreadStore(ptr, val); + return val; + } + }; + +public: + + // Required iterator traits + typedef CacheModifiedOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType* ptr; + +public: + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedOutputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + 
{ + return Reference(ptr + n); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/constant_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/constant_input_iterator.cuh new file mode 100644 index 0000000000..1e0a91044d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/constant_input_iterator.cuh @@ -0,0 +1,235 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * + * \par Overview + * - Read references to a ConstantInputIteratorTiterator always return the supplied constant + * of type \p ValueType. + * - Can be used with any data type. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. 
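Combining the output wrapper above with the constant-input generator just introduced, a hedged sketch that streams the constant 5.0 into an array using write-through (STORE_WT) stores; FillKernel, d_out and n are illustrative names:

__global__ void FillKernel(double *d_out, int n)
{
    cub::ConstantInputIterator<double> five(5.0);                       // always reads 5.0
    cub::CacheModifiedOutputIterator<cub::STORE_WT, double> out(d_out); // write-through stores

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = five[i];           // proxy assignment -> ThreadStore<STORE_WT>(d_out + i, 5.0)
}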
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * dereference a sequence of homogeneous doubles. + * \par + * \code + * #include // or equivalently + * + * cub::ConstantInputIterator itr(5.0); + * + * printf("%f\n", itr[0]); // 5.0 + * printf("%f\n", itr[1]); // 5.0 + * printf("%f\n", itr[2]); // 5.0 + * printf("%f\n", itr[50]); // 5.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class ConstantInputIterator +{ +public: + + // Required iterator traits + typedef ConstantInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + OffsetT offset; +#ifdef _WIN32 + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantInputIterator( + ValueType val, ///< Starting value for the iterator instance to report + OffsetT offset = 0) ///< Base offset + : + val(val), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const + { + return val; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const 
self_type& rhs) + { + return (offset == rhs.offset) && ((val == rhs.val)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset) || (val!= rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "," << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/counting_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/counting_input_iterator.cuh new file mode 100644 index 0000000000..7f49348d6c --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/counting_input_iterator.cuh @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + +/** + * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * + * \par Overview + * - After initializing a CountingInputIteratorTto a certain integer \p base, read references + * at \p offset will return the value \p base + \p offset. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. 
+ * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CountingInputIteratorTto + * dereference a sequence of incrementing integers. + * \par + * \code + * #include // or equivalently + * + * cub::CountingInputIterator itr(5); + * + * printf("%d\n", itr[0]); // 5 + * printf("%d\n", itr[1]); // 6 + * printf("%d\n", itr[2]); // 7 + * printf("%d\n", itr[50]); // 55 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class CountingInputIterator +{ +public: + + // Required iterator traits + typedef CountingInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CountingInputIterator( + const ValueType &val) ///< Starting value for the iterator instance to report + : + val(val) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + val++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + val++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val + (ValueType) n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + val += (ValueType) n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val - (ValueType) n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + val -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return (difference_type) (val - other.val); + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val + (ValueType) n; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == rhs.val); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + 
return (val != rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "]"; + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/discard_output_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/discard_output_iterator.cuh new file mode 100644 index 0000000000..28473e5f22 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/discard_output_iterator.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
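Beyond the snippet above, the counting iterator also supports the usual random-access arithmetic without any backing storage, which a small host-side sketch makes concrete (itr, other and gap are hypothetical names):

cub::CountingInputIterator<int> itr(10);

int a = *itr;                       // 10
int b = itr[7];                     // 17
itr += 3;                           // itr now reports 13 at offset 0

cub::CountingInputIterator<int> other(25);
ptrdiff_t gap = other - itr;        // 12: distance recovered from the underlying values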
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A discard iterator + */ +template +class DiscardOutputIterator +{ +public: + + // Required iterator traits + typedef DiscardOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef void reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + OffsetT offset; + +#if defined(_WIN32) || !defined(_WIN64) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ DiscardOutputIterator( + OffsetT offset = 0) ///< Base offset + : + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ self_type& operator*() + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ self_type& operator[](Distance n) + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return; + } + + /// Assignment to self (no-op) + __host__ __device__ __forceinline__ void operator=(self_type const& 
other) + { + offset = other.offset; + } + + /// Assignment to anything else (no-op) + template + __host__ __device__ __forceinline__ void operator=(T const&) + {} + + /// Cast to void* operator + __host__ __device__ __forceinline__ operator void*() const { return NULL; } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/tex_obj_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/tex_obj_input_iterator.cuh new file mode 100644 index 0000000000..b99103ec55 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/tex_obj_input_iterator.cuh @@ -0,0 +1,310 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. 
Uses newer Kepler-style texture objects. + * + * \par Overview + * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be + * created by the host thread, but can be used by any descendant kernel. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexObjInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... + * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + typename OffsetT = ptrdiff_t> +class TexObjInputIterator +{ +public: + + // Required iterator traits + typedef TexObjInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + // Largest texture word we can use in device + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + +private: + + T* ptr; + difference_type tex_offset; + cudaTextureObject_t tex_obj; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexObjInputIterator() + : + ptr(NULL), + tex_offset(0), + tex_obj(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + this->tex_offset = tex_offset; + + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + 
memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = this->ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return cudaDestroyTextureObject(tex_obj); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/tex_ref_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/tex_ref_input_iterator.cuh new file mode 100644 index 0000000000..95d0ffbc96 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/tex_ref_input_iterator.cuh @@ -0,0 +1,374 @@ 
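The bind/use/unbind life cycle of the texture-object iterator above looks roughly as follows; ReadKernel, d_in, d_out, num_items, grid_size and block_size are placeholder names:

__global__ void ReadKernel(cub::TexObjInputIterator<double> in, double *d_out, int num_items)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_items)
        d_out[i] = in[i];           // tex1Dfetch through the bound texture object
}

// Host side: the iterator must be constructed and bound on the host,
// and unbound only after the consuming kernel has finished.
cub::TexObjInputIterator<double> in;
in.BindTexture(d_in, sizeof(double) * num_items);   // d_in aligned to textureAlignment
ReadKernel<<<grid_size, block_size>>>(in, d_out, num_items);
cudaDeviceSynchronize();
in.UnbindTexture();                                 // destroys the texture object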
+/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer + +#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Static file-scope Tesla/Fermi-style texture references + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +// Anonymous namespace +namespace { + +/// Global texture reference specialized by type +template +struct IteratorTexRef +{ + /// And by unique ID + template + struct TexId + { + // Largest texture word we can use in device + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + + // Texture reference type + typedef texture TexRef; + + // Texture reference + static TexRef ref; + + /// Bind texture + static cudaError_t BindTexture(void *d_in, size_t &offset) + { + if (d_in) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + ref.channelDesc = tex_desc; + return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + } + + return cudaSuccess; + } 
+ + /// Unbind texture + static cudaError_t UnbindTexture() + { + return CubDebug(cudaUnbindTexture(ref)); + } + + /// Fetch element + template + static __device__ __forceinline__ T Fetch(Distance tex_offset) + { + DeviceWord temp[DEVICE_MULTIPLE]; + TextureWord *words = reinterpret_cast(temp); + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); + } + + return reinterpret_cast(temp); + } + }; +}; + +// Texture reference definitions +template +template +typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; + + +} // Anonymous namespace + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. + * + * \par Overview + * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * reference. Only one TexRefInputIteratorTinstance can be bound at any given time for a + * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * thread, and (4) compilation .o unit. + * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be + * created by the host thread and used by a top-level kernel (i.e. the one which is launched + * from the host). + * - Compatible with Thrust API v1.7 or newer. + * - Compatible with CUDA toolkit v5.5 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexRefInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... 
+ * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + int UNIQUE_ID, + typename OffsetT = ptrdiff_t> +class TexRefInputIterator +{ +public: + + // Required iterator traits + typedef TexRefInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + T* ptr; + difference_type tex_offset; + + // Texture reference wrapper (old Tesla/Fermi-style textures) + typedef typename IteratorTexRef::template TexId TexId; + +public: +/* + /// Constructor + __host__ __device__ __forceinline__ TexRefInputIterator() + : + ptr(NULL), + tex_offset(0) + {} +*/ + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + size_t offset; + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); + return retval; + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return TexId::UnbindTexture(); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Use the texture reference + return TexId::Fetch(tex_offset); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ 
__device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + +#endif // CUDA_VERSION diff --git a/GraphBLAS/CUDA/local_cub/iterator/transform_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/transform_input_iterator.cuh new file mode 100644 index 0000000000..dad1f50041 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/transform_input_iterator.cuh @@ -0,0 +1,252 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
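[Editor's note: an illustrative aside on the legacy TexRefInputIterator just defined, not part of the vendored file. Because the texture reference is a static variable named by the UNIQUE_ID template parameter, two simultaneously bound iterators of the same value type need distinct IDs. The pointers d_a, d_b and the count n below are hypothetical.]

    // Two independently bound texture-reference iterators over float data
    cub::TexRefInputIterator<float, 0> itr_a;       // static texture reference #0
    cub::TexRefInputIterator<float, 1> itr_b;       // static texture reference #1
    itr_a.BindTexture(d_a, sizeof(float) * n);
    itr_b.BindTexture(d_b, sizeof(float) * n);
    // ... launch top-level kernels that read itr_a[i] and itr_b[i] through texture cache ...
    itr_a.UnbindTexture();
    itr_b.UnbindTexture();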
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for transforming dereferenced values. + * + * \par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type \p + * ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type \p ValueType from the latter. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TransformInputIteratorTto + * dereference an array of integers, tripling the values and converting them to doubles. + * \par + * \code + * #include // or equivalently + * + * // Functor for tripling integer values and converting to doubles + * struct TripleDoubler + * { + * __host__ __device__ __forceinline__ + * double operator()(const int &a) const { + * return double(a * 3); + * } + * }; + * + * // Declare, allocate, and initialize a device array + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * TripleDoubler conversion_op; + * + * // Create an iterator wrapper + * cub::TransformInputIterator itr(d_in, conversion_op); + * + * // Within device code: + * printf("%f\n", itr[0]); // 24.0 + * printf("%f\n", itr[1]); // 18.0 + * printf("%f\n", itr[6]); // 27.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
+ * \tparam InputIteratorT The type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * + */ +template < + typename ValueType, + typename ConversionOp, + typename InputIteratorT, + typename OffsetT = ptrdiff_t> +class TransformInputIterator +{ +public: + + // Required iterator traits + typedef TransformInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ConversionOp conversion_op; + InputIteratorT input_itr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TransformInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + ConversionOp conversion_op) ///< Conversion functor to wrap + : + conversion_op(conversion_op), + input_itr(input_itr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + input_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + input_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(*input_itr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(input_itr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + input_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(input_itr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + input_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return input_itr - other.input_itr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(input_itr[n]); + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*input_itr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (input_itr != rhs.input_itr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + + 
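[Editor's note: a small, self-contained sketch of TransformInputIterator, added as an aside and not part of the vendored file. It wraps plain host memory, which the class documentation above permits, so the wrapper can be checked on the CPU; the Square functor and array contents are made up for illustration. Compile with nvcc because of the __host__ __device__ qualifiers.]

    #include <cstdio>
    #include "local_cub/iterator/transform_input_iterator.cuh"   // assumed include path

    // Hypothetical unary conversion functor
    struct Square
    {
        __host__ __device__ __forceinline__ int operator()(const int &x) const { return x * x; }
    };

    int main()
    {
        int h_in[4] = {1, 2, 3, 4};
        // Wrap a plain host pointer; dereferencing on the host is allowed.
        cub::TransformInputIterator<int, Square, int*> itr(h_in, Square());
        for (int i = 0; i < 4; ++i)
            printf("%d\n", itr[i]);         // prints 1, 4, 9, 16
        return 0;
    }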
+/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_load.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_load.cuh new file mode 100644 index 0000000000..b1ca412faf --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_load.cuh @@ -0,0 +1,438 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 96-bit load using cache-streaming modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); + * \endcode + * + * \tparam MODIFIER [inferred] CacheLoadModifier enumeration + * \tparam InputIteratorT [inferred] Input iterator type \iterator + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated load iteration (inductive case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const *ptr, T *vals) + { + vals[COUNT] = ThreadLoad(ptr + COUNT); + IterateThreadLoad::template Load(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) + { + vals[COUNT] = itr[COUNT]; + IterateThreadLoad::Dereference(itr, vals); + } +}; + + +/// Helper structure for templated load iteration (termination case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ + { \ + uint4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ + { \ + ulonglong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ + { \ + ushort4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ + { \ + uint2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad 
specialization for the given Cache load modifier + */ +#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_LOAD_ALL(LOAD_CA, ca) + _CUB_LOAD_ALL(LOAD_CG, cg) + _CUB_LOAD_ALL(LOAD_CS, cs) + _CUB_LOAD_ALL(LOAD_CV, cv) +#else + _CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + _CUB_LOAD_ALL(LOAD_CG, volatile.global) + _CUB_LOAD_ALL(LOAD_CS, global) + _CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + _CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + _CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +// Macro cleanup +#undef _CUB_LOAD_ALL +#undef _CUB_LOAD_1 +#undef _CUB_LOAD_2 +#undef _CUB_LOAD_4 +#undef _CUB_LOAD_8 +#undef _CUB_LOAD_16 + + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIteratorT itr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + T retval = *reinterpret_cast(ptr); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord 
words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T const *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + typedef typename UnitWord::DeviceWord DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( + reinterpret_cast(const_cast(ptr)), + words); + + return *reinterpret_cast(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_operators.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_operators.cuh new file mode 100644 index 0000000000..76cd800f58 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_operators.cuh @@ -0,0 +1,317 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns (a == b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a != b; + } +}; + + +/** + * \brief Inequality functor (wraps equality functor) + */ +template +struct InequalityWrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Constructor + __host__ __device__ __forceinline__ + InequalityWrapper(EqualityOp op) : op(op) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return !op(a, b); + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns (a > b) ? a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MAX(a, b); + } +}; + + +/** + * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) + */ +struct ArgMax +{ + /// Boolean max operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default min functor + */ +struct Min +{ + /// Boolean min operator, returns (a < b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MIN(a, b); + } +}; + + +/** + * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) + */ +struct ArgMin +{ + /// Boolean min operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default cast functor + */ +template +struct CastOp +{ + /// Cast operator, returns (B) a + template + __host__ __device__ __forceinline__ B operator()(const A &a) const + { + return (B) a; + } +}; + + +/** + * \brief Binary operator wrapper for switching non-commutative scan arguments + */ +template +class SwizzleScanOp +{ +private: + + /// Wrapped scan operator + ScanOp scan_op; + +public: + + /// Constructor + __host__ __device__ __forceinline__ + SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} + + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ + T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); + + return scan_op(_b, _a); + } +}; + + +/** + * \brief Reduce-by-segment functor. + * + * Given two cub::KeyValuePair inputs \p a and \p b and a + * binary associative combining operator \p f(const T &x, const T &y), + * an instance of this functor returns a cub::KeyValuePair whose \p key + * field is a.key + b.key, and whose \p value field + * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * + * ReduceBySegmentOp is an associative, non-commutative binary combining operator + * for input sequences of cub::KeyValuePair pairings. Such + * sequences are typically used to represent a segmented set of values to be reduced + * and a corresponding set of {0,1}-valued integer "head flags" demarcating the + * first value of each segment. + * + */ +template ///< Binary reduction operator to apply to values +struct ReduceBySegmentOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval; + retval.key = first.key + second.key; + retval.value = (second.key) ? 
+ second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate + op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + return retval; + } +}; + + + +template ///< Binary reduction operator to apply to values +struct ReduceByKeyOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval = second; + + if (first.key == second.key) + retval.value = op(first.value, retval.value); + + return retval; + } +}; + + + + + + + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_reduce.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_reduce.cuh new file mode 100644 index 0000000000..4c13688f33 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_reduce.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
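[Editor's note: an illustrative aside on ReduceBySegmentOp defined above, not part of the vendored file. It shows how one application of the operator combines two (head-flag, value) partial reductions; the numeric values are made up, and the snippet can run in any __host__ or __device__ function.]

    cub::KeyValuePair<int, float> a, b;
    a.key = 0; a.value = 5.0f;      // partial reduction that does not cross a segment boundary
    b.key = 1; b.value = 2.0f;      // partial reduction that starts a new segment (head flag = 1)

    cub::ReduceBySegmentOp<cub::Sum> op;
    cub::KeyValuePair<int, float> r = op(a, b);
    // r.key == 1   : head flags accumulate
    // r.value == 2 : b spans a segment reset, so a's running aggregate is dropped
    //                rather than summed into b's value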
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + +/** + * Sequential reduction over statically-sized array types + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type /*length*/) +{ + T retval = prefix; + + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + retval = reduction_op(retval, input[i]); + + return retval; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. 
+ * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce((T*) input, reduction_op); +} + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_scan.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_scan.cuh new file mode 100644 index 0000000000..8d67549ae8 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_scan.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
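[Editor's note: an illustrative aside on the ThreadReduce overloads above, not part of the vendored file. A thread reduces a statically-sized register array with one call; LENGTH is inferred from the array bound, and cub::Max comes from thread_operators.cuh. The values are made up.]

    __device__ float MaxOfFour()
    {
        float items[4] = {3.0f, 9.0f, 1.0f, 4.0f};          // e.g., already loaded into registers
        return cub::internal::ThreadReduce(items, cub::Max());   // returns 9.0f
    }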
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + + return inclusive; +} + + + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } + + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_search.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_search.cuh new file mode 100644 index 0000000000..3099080a3c --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_search.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
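[Editor's note: an illustrative aside on the per-thread scan routines above, not part of the vendored file. It performs an exclusive prefix sum in place over a register array, seeded with a prefix of 0; the values are made up.]

    __device__ int ExclusiveSumOfFour()
    {
        int items[4] = {1, 2, 3, 4};
        // Scan in place; the return value is the inclusive aggregate.
        int aggregate = cub::internal::ThreadScanExclusive(items, items, cub::Sum(), 0);
        // items is now {0, 1, 3, 6}; aggregate == 10
        return aggregate;
    }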
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential search + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Computes the begin offsets into A and B for the specific diagonal + */ +template < + typename AIteratorT, + typename BIteratorT, + typename OffsetT, + typename CoordinateT> +__host__ __device__ __forceinline__ void MergePathSearch( + OffsetT diagonal, + AIteratorT a, + BIteratorT b, + OffsetT a_len, + OffsetT b_len, + CoordinateT& path_coordinate) +{ + /// The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + OffsetT split_min = CUB_MAX(diagonal - b_len, 0); + OffsetT split_max = CUB_MIN(diagonal, a_len); + + while (split_min < split_max) + { + OffsetT split_pivot = (split_min + split_max) >> 1; + if (a[split_pivot] <= b[diagonal - split_pivot - 1]) + { + // Move candidate split range up A, down B + split_min = split_pivot + 1; + } + else + { + // Move candidate split range up B, down A + split_max = split_pivot; + } + } + + path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.y = diagonal - split_min; +} + + + +/** + * \brief Returns the offset of the first value within \p input which does not compare less than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT LowerBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (input[retval + half] < val) + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + else + { + num_items = half; + } + } + + return retval; +} + + +/** + * \brief Returns the offset of the first value within \p input which compares greater than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT UpperBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (val < input[retval + half]) + { + num_items = half; + } + else + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + } + + return retval; +} + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_store.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_store.cuh new file mode 100644 index 0000000000..ec20b36f40 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_store.cuh @@ -0,0 +1,422 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory store operations. + */ +enum CacheStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
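A hedged sketch of typical ThreadStore usage with explicit cache modifiers, following the template signature declared in this file; the output pointers and values are illustrative:

__global__ void StoreSketch(int *d_int_out, double4 *d_vec_out)
{
    // Cache at global level (STORE_CG):
    cub::ThreadStore<cub::STORE_CG>(d_int_out + threadIdx.x, 1);

    // Cache-streaming hint for data unlikely to be accessed again (STORE_CS):
    double4 v = make_double4(1.0, 2.0, 3.0, 4.0);
    cub::ThreadStore<cub::STORE_CS>(d_vec_out + threadIdx.x, v);

    // Default modifier behaves like a plain assignment through the iterator:
    cub::ThreadStore<cub::STORE_DEFAULT>(d_int_out + threadIdx.x, 2);
}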
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 96-bit store using cache-streaming cache modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * \endcode + * + * \tparam MODIFIER [inferred] CacheStoreModifier enumeration + * \tparam InputIteratorT [inferred] Output iterator type \iterator + * \tparam T [inferred] Data type of output value + */ +template < + CacheStoreModifier MODIFIER, + typename OutputIteratorT, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated store iteration (inductive case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore(ptr + COUNT, vals[COUNT]); + IterateThreadStore::template Store(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) + { + ptr[COUNT] = vals[COUNT]; + IterateThreadStore::Dereference(ptr, vals); + } + +}; + +/// Helper structure for templated store iteration (termination case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + 
__device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + _CUB_STORE_16(cub_modifier, ptx_modifier) \ + _CUB_STORE_8(cub_modifier, ptx_modifier) \ + _CUB_STORE_4(cub_modifier, ptx_modifier) \ + _CUB_STORE_2(cub_modifier, ptx_modifier) \ + _CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_STORE_ALL(STORE_WB, wb) + _CUB_STORE_ALL(STORE_CG, cg) + _CUB_STORE_ALL(STORE_CS, cs) + _CUB_STORE_ALL(STORE_WT, wt) +#else + _CUB_STORE_ALL(STORE_WB, global) + _CUB_STORE_ALL(STORE_CG, global) + _CUB_STORE_ALL(STORE_CS, global) + _CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +// Macro cleanup +#undef _CUB_STORE_ALL +#undef _CUB_STORE_1 +#undef _CUB_STORE_2 +#undef _CUB_STORE_4 +#undef _CUB_STORE_8 +#undef _CUB_STORE_16 + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ void ThreadStore( + OutputIteratorT itr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *itr = val; +} + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *ptr = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + *reinterpret_cast(ptr) = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + VolatileWord words[VOLATILE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on 
pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadStore definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for generic modifiers + */ +template +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_allocator.cuh b/GraphBLAS/CUDA/local_cub/util_allocator.cuh new file mode 100644 index 0000000000..0e6dd0486e --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_allocator.cuh @@ -0,0 +1,708 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. 
The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include +#include + +#include "host/mutex.cuh" +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
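A hedged illustration of how the binning rules above play out under the default configuration (bin_growth = 8, min_bin = 3, max_bin = 7):

// request of        100 bytes -> rounded up to    512 bytes (8^3, the minimum bin)
// request of      3,000 bytes -> rounded up to  4,096 bytes (8^4)
// request of    500,000 bytes -> rounded up to   2 MB       (8^7, the maximum bin)
// request of 10,000,000 bytes -> allocated exactly and freed on DeviceFree,
//                                since it exceeds 8^7 and is never cached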
+ * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + typedef std::map GpuCachedBytes; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round 
up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
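To make the rounding concrete, a small host-side sketch that calls the NearestPowerOf helper above directly (it is a public member of the struct); the wrapper function is illustrative:

void BinRoundingSketch()
{
    cub::CachingDeviceAllocator alloc;               // default bin_growth = 8
    unsigned int bin;
    size_t bin_bytes;
    alloc.NearestPowerOf(bin, bin_bytes, 8, 1000);   // bin = 4, bin_bytes =  4,096 (8^4)
    alloc.NearestPowerOf(bin, bin_bytes, 8, 20000);  // bin = 5, bin_bytes = 32,768 (8^5)
}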
+ * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) + { + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Attempt to allocate + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
+ if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + return error; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + int device, + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + } + + // Unlock + mutex.Unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFree(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + void* d_ptr) + { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free device memory + if (CubDebug(error = cudaFree(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[current_device].free -= begin->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_arch.cuh b/GraphBLAS/CUDA/local_cub/util_arch.cuh new file mode 100644 index 0000000000..28d81e7cd0 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_arch.cuh @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
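A hedged end-to-end sketch of the allocator interface above; the global allocator, the stream, and the request size are illustrative:

cub::CachingDeviceAllocator g_allocator;    // default bins: 512B, 4KB, 32KB, 256KB, 2MB

cudaError_t ScratchSketch(cudaStream_t stream)
{
    double *d_scratch = NULL;

    // A 16 KB request is rounded up to the 32 KB bin (8^5 bytes):
    cudaError_t error = g_allocator.DeviceAllocate((void **) &d_scratch,
                                                   2048 * sizeof(double), stream);
    if (error) return error;

    // ... launch kernels on `stream` that use d_scratch ...

    // The block returns to the 32 KB bin: immediately reusable on `stream`, and
    // reusable on other streams once prior work submitted to `stream` completes.
    error = g_allocator.DeviceFree(d_scratch);

    // Optionally release all cached blocks back to the CUDA runtime:
    // g_allocator.FreeAllCached();

    return error;
}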
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) + #define CUB_USE_COOPERATIVE_GROUPS +#endif + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef CUB_PTX_ARCH + #ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 + #else + #define CUB_PTX_ARCH __CUDA_ARCH__ + #endif +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. +#ifndef CUB_RUNTIME_FUNCTION + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ + #else + #define CUB_RUNTIME_FUNCTION __host__ + #endif +#endif + + +/// Number of threads per warp +#ifndef CUB_LOG_WARP_THREADS + #define CUB_LOG_WARP_THREADS(arch) \ + (5) + #define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#endif + + +/// Number of smem banks +#ifndef CUB_LOG_SMEM_BANKS + #define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + #define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#endif + + +/// Oversubscription factor +#ifndef CUB_SUBSCRIPTION_FACTOR + #define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) +#endif + + +/// Prefer padding overhead vs X-way conflicts greater than this threshold +#ifndef CUB_PREFER_CONFLICT_OVER_PADDING + #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? \ + (1) : \ + (4)) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +#endif + + +/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. +#ifndef CUB_SCALED_BLOCK_THREADS + #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_BLOCK_THREADS, \ + CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ + 2, \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) +#endif + +/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. 
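A worked example of the thread-scaling macro above, assuming a kernel tuned for 128 threads and 4-byte items that is re-targeted at 8-byte doubles (warp size 32):

//   CUB_SCALED_BLOCK_THREADS(128, double, 200)
//     = CUB_MIN(128, 32 * CUB_MAX(2, (128 / 32) * 4 / sizeof(double)))
//     = CUB_MIN(128, 32 * 2)
//     = 64 threads per block
enum { SCALED_THREADS = CUB_SCALED_BLOCK_THREADS(128, double, 200) };   // 64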
Minimum 1 item per thread +#ifndef CUB_SCALED_ITEMS_PER_THREAD + #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + CUB_MAX( \ + 1, \ + (sizeof(T) < 4) ? \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) +#endif + +/// Define both nominal threads-per-block and items-per-thread +#ifndef CUB_SCALED_GRANULARITIES + #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#endif + + + +#endif // Do not document + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_debug.cuh b/GraphBLAS/CUDA/local_cub/util_debug.cuh new file mode 100644 index 0000000000..3ad832e731 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_debug.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
+ */ + +#pragma once + +#include +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ + (void)filename; + (void)line; +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#ifndef CubDebug + #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) +#endif + + +/** + * \brief Debug macro with exit + */ +#ifndef CubDebugExit + #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } +#endif + + +/** + * \brief Log macro for printf statements. + */ +#if !defined(_CubLog) + #if !(defined(__clang__) && defined(__CUDA__)) + #if (CUB_PTX_ARCH == 0) + #define _CubLog(format, ...) printf(format,__VA_ARGS__); + #elif (CUB_PTX_ARCH >= 200) + #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); + #endif + #else + // XXX shameless hack for clang around variadic printf... + // Compilies w/o supplying -std=c++11 but shows warning, + // so we sielence them :) + #pragma clang diagnostic ignored "-Wc++11-extensions" + #pragma clang diagnostic ignored "-Wunnamed-type-template-args" + template + inline __host__ __device__ void va_printf(char const* format, Args const&... args) + { + #ifdef __CUDA_ARCH__ + printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); + #else + printf(format, args...); + #endif + } + #ifndef __CUDA_ARCH__ + #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); + #else + #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); + #endif + #endif +#endif + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_device.cuh b/GraphBLAS/CUDA/local_cub/util_device.cuh new file mode 100644 index 0000000000..a5f3b61443 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_device.cuh @@ -0,0 +1,347 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
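A hedged host-side sketch of the error-checking macros defined in util_debug.cuh above; the wrapper function and buffer size are illustrative:

cudaError_t AllocSketch()
{
    float *d_buf = NULL;

    // On failure, CubDebug prints "CUDA error ... [file, line]" to stderr
    // (when CUB_STDERR is defined) and returns the error unchanged:
    cudaError_t error = CubDebug(cudaMalloc((void **) &d_buf, 1024 * sizeof(float)));
    if (error) return error;

    // CubDebugExit additionally calls exit(1) when the wrapped call fails:
    CubDebugExit(cudaFree(d_buf));
    return cudaSuccess;
}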
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Properties of a given CUDA device and the corresponding PTX bundle + */ + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + */ +template +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorInvalidValue); + } + + // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + + +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template +__global__ void EmptyKernel(void) { } + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) +{ + struct Dummy + { + /// Type definition of the EmptyKernel kernel entry point + typedef void (*EmptyKernelPtr)(); + + /// Force EmptyKernel to be generated if this class is used + CUB_RUNTIME_FUNCTION __forceinline__ + EmptyKernelPtr Empty() + { + return EmptyKernel; + } + }; + + +#ifndef CUB_RUNTIME_ENABLED + (void)ptx_version; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#elif (CUB_PTX_ARCH > 0) + + ptx_version = CUB_PTX_ARCH; + return cudaSuccess; + +#else + + cudaError_t error = cudaSuccess; + do + { + cudaFuncAttributes empty_kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + } + while (0); + + return error; + +#endif +} + + +/** + * \brief Retrieves the SM version (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)sm_version; + (void)device_ordinal; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#else + + cudaError_t error = cudaSuccess; + do + { + // Fill in SM version + int major, minor; + if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; + if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; + sm_version = major * 100 + minor * 10; + } + while (0); + + return error; + +#endif +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Synchronize the stream if specified + */ +CUB_RUNTIME_FUNCTION __forceinline__ +static cudaError_t SyncStream(cudaStream_t stream) +{ +#if (CUB_PTX_ARCH == 0) + return cudaStreamSynchronize(stream); +#else + (void)stream; + // Device can't yet sync on a specific 
stream + return cudaDeviceSynchronize(); +#endif +} + + +/** + * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * + * \par Snippet + * The code snippet below illustrates the use of the MaxSmOccupancy function. + * \par + * \code + * #include // or equivalently + * + * template + * __global__ void ExampleKernel() + * { + * // Allocate shared memory for BlockScan + * __shared__ volatile T buffer[4096]; + * + * ... + * } + * + * ... + * + * // Determine SM occupancy for ExampleKernel specialized for unsigned char + * int max_sm_occupancy; + * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); + * + * // max_sm_occupancy <-- 4 on SM10 + * // max_sm_occupancy <-- 8 on SM20 + * // max_sm_occupancy <-- 12 on SM35 + * + * \endcode + * + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads, ///< [in] Number of threads per thread block + int dynamic_smem_bytes = 0) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)dynamic_smem_bytes; + (void)block_threads; + (void)kernel_ptr; + (void)max_sm_occupancy; + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + +#else + + return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( + &max_sm_occupancy, + kernel_ptr, + block_threads, + dynamic_smem_bytes); + +#endif // CUB_RUNTIME_ENABLED +} + + +/****************************************************************************** + * Policy management + ******************************************************************************/ + +/** + * Kernel dispatch configuration + */ +struct KernelConfig +{ + int block_threads; + int items_per_thread; + int tile_size; + int sm_occupancy; + + CUB_RUNTIME_FUNCTION __forceinline__ + KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init(KernelPtrT kernel_ptr) + { + block_threads = AgentPolicyT::BLOCK_THREADS; + items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); + return retval; + } +}; + + + +/// Helper for dispatching into a policy chain +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int ptx_version, FunctorT &op) + { + if (ptx_version < PTX_VERSION) { + return PrevPolicyT::Invoke(ptx_version, op); + } + return op.template Invoke(); + } +}; + +/// Helper for dispatching into a policy chain (end-of-chain specialization) +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef PolicyT ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { + return op.template Invoke(); + } +}; + + + + +#endif // Do not 
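A hedged sketch combining PtxVersion and MaxSmOccupancy from util_device.cuh above; the kernel and block size are illustrative:

__global__ void OccupancySketchKernel(int *d_out)
{
    if (threadIdx.x == 0) d_out[blockIdx.x] = 0;
}

void OccupancySketch()
{
    int ptx_version = 0;
    CubDebugExit(cub::PtxVersion(ptx_version));     // e.g. 700 when compiled for sm_70

    int max_blocks_per_sm = 0;
    CubDebugExit(cub::MaxSmOccupancy(max_blocks_per_sm, OccupancySketchKernel, 256));
    // max_blocks_per_sm now holds the kernel's occupancy at 256 threads per block
}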
document + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_macro.cuh b/GraphBLAS/CUDA/local_cub/util_macro.cuh new file mode 100644 index 0000000000..ff86365422 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_macro.cuh @@ -0,0 +1,103 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +#ifndef CUB_ALIGN + #if defined(_WIN32) || defined(_WIN64) + /// Align struct + #define CUB_ALIGN(bytes) __declspec(align(32)) + #else + /// Align struct + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) + #endif +#endif + +#ifndef CUB_MAX + /// Select maximum(a, b) + #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) +#endif + +#ifndef CUB_MIN + /// Select minimum(a, b) + #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) +#endif + +#ifndef CUB_QUOTIENT_FLOOR + /// Quotient of x/y rounded down to nearest integer + #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#endif + +#ifndef CUB_QUOTIENT_CEILING + /// Quotient of x/y rounded up to nearest integer + #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#endif + +#ifndef CUB_ROUND_UP_NEAREST + /// x rounded up to the nearest multiple of y + #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#endif + +#ifndef CUB_ROUND_DOWN_NEAREST + /// x rounded down to the nearest multiple of y + #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#endif + + +#ifndef CUB_STATIC_ASSERT + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) + #endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Static assert + #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] +#endif + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_namespace.cuh b/GraphBLAS/CUDA/local_cub/util_namespace.cuh new file mode 100644 index 0000000000..c8991d08fb --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_namespace.cuh @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
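An illustrative use of the arithmetic macros defined in util_macro.cuh above for sizing a kernel grid; the element count and block size are made-up values, and the header is assumed to be included.

void grid_sizing_example()
{
    int grid_blocks = CUB_QUOTIENT_CEILING(1000, 128);    // (1000 + 127) / 128 == 8 blocks
    int padded_n    = CUB_ROUND_UP_NEAREST(1000, 128);    // 8 * 128 == 1024
    int trimmed_n   = CUB_ROUND_DOWN_NEAREST(1000, 128);  // 7 * 128 == 896
    CUB_STATIC_ASSERT(sizeof(int) >= 4, "sketch assumes at least a 32-bit int");
    (void) grid_blocks; (void) padded_n; (void) trimmed_n;
}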
+ * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// For example: +//#define CUB_NS_PREFIX namespace thrust{ namespace detail { +//#define CUB_NS_POSTFIX } } + +#ifndef CUB_NS_PREFIX +#define CUB_NS_PREFIX +#endif + +#ifndef CUB_NS_POSTFIX +#define CUB_NS_POSTFIX +#endif diff --git a/GraphBLAS/CUDA/local_cub/util_ptx.cuh b/GraphBLAS/CUDA/local_cub/util_ptx.cuh new file mode 100644 index 0000000000..582ca0d8b8 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_ptx.cuh @@ -0,0 +1,758 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
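A sketch of how the CUB_NS_PREFIX / CUB_NS_POSTFIX hooks from util_namespace.cuh above can nest cub inside a project namespace; the outer namespace name graphblas_gpu is hypothetical.

// Define the hooks before including any local_cub header.
#define CUB_NS_PREFIX  namespace graphblas_gpu {
#define CUB_NS_POSTFIX }
#include "util_macro.cuh"   // cub:: symbols now resolve as graphblas_gpu::cub::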
+ * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" +#include "util_debug.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilPtx + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define _CUB_ASM_PTR_SIZE_ "u32" +#endif + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Bitfield-extract. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type /*byte_len*/) +{ + unsigned int bits; +#if CUB_PTX_ARCH >= 200 + asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<8> /*byte_len*/) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + return BFE(source, bit_start, num_bits, Int2Type()); +} + + +/** + * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. 
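A short device-side sketch of the bitfield-extract and shift-add helpers above, assuming this header is included; the packed value and the expected results are illustrative.

__global__ void bit_helpers_demo(unsigned int *out)
{
    unsigned int packed = 0xAABBCCDDu;
    out[0] = cub::BFE(packed, 8, 8);       // bits [15:8]  -> 0xCC
    out[1] = cub::BFE(packed, 24, 8);      // bits [31:24] -> 0xAA
    out[2] = cub::SHR_ADD(packed, 16, 1);  // (0xAABBCCDD >> 16) + 1 == 0xAABC
}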
+ */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if CUB_PTX_ARCH >= 200 + asm ("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + x <<= bit_start; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + ret = (y & MASK_Y) | (x & MASK_X); +#endif +} + + +/** + * \brief Three-operand add. Returns \p x + \p y + \p z. + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if CUB_PTX_ARCH >= 200 + asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. + * + * \par + * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: + * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes + * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within + * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} + * + * \par Snippet + * The code snippet below illustrates byte-permute. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * int a = 0x03020100; + * int b = 0x07060504; + * int index = 0x00007531; + * + * int selected = PRMT(a, b, index); // 0x07050301 + * + * \endcode + * + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Sync-threads barrier. 
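A device-side sketch of the byte-permute and three-operand add helpers above; the selector and its expected result follow the PRMT snippet in the header comment.

__global__ void prmt_iadd3_demo(unsigned int *out)
{
    out[0] = cub::PRMT(0x03020100u, 0x07060504u, 0x00007531u);  // picks bytes 1,3,5,7 -> 0x07050301
    out[1] = cub::IADD3(1u, 2u, 3u);                            // 1 + 2 + 3 == 6
}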
+ */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + +/** + * CTA barrier + */ +__device__ __forceinline__ void CTA_SYNC() +{ + __syncthreads(); +} + + +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_AND(int p) +{ + return __syncthreads_and(p); +} + + +/** + * Warp barrier + */ +__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + __syncwarp(member_mask); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __any_sync(member_mask, predicate); +#else + return ::__any(predicate); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __all_sync(member_mask, predicate); +#else + return ::__all(predicate); +#endif +} + + +/** + * Warp ballot + */ +__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __ballot_sync(member_mask, predicate); +#else + return __ballot(predicate); +#endif +} + +/** + * Warp synchronous shfl_up + */ +__device__ __forceinline__ +unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.up.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_down + */ +__device__ __forceinline__ +unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.down.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); +#endif + return word; +} + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
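A sketch of the warp-vote wrappers above for one full 32-thread warp; the even-lane predicate is arbitrary and only lane 0 records the results.

__global__ void warp_vote_demo(int *out)
{
    const unsigned int full_mask = 0xffffffffu;
    int pred   = (threadIdx.x & 1) == 0;             // true on even lanes
    int any    = cub::WARP_ANY(pred, full_mask);     // 1: at least one lane is even
    int all    = cub::WARP_ALL(pred, full_mask);     // 0: odd lanes fail the predicate
    int ballot = cub::WARP_BALLOT(pred, full_mask);  // 0x55555555: one bit per even lane
    if (cub::LaneId() == 0) { out[0] = any; out[1] = all; out[2] = ballot; }
}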
+ */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm volatile("exit;"); +} + + +/** + * \brief Abort execution and generate an interrupt to the host CPU + */ +__device__ __forceinline__ void ThreadTrap() { + asm volatile("trap;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional thread block + */ +__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) +{ + return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + + threadIdx.x; +} + + +/** + * \brief Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. + */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** @} */ // end group UtilPtx + + + + +/** + * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) + * \ingroup WarpModule + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * predecessor of its predecessor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
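A common pattern built from the ballot and lane-mask helpers above: computing each voting lane's exclusive rank within its warp. The predicate is arbitrary and a single full 32-thread warp is assumed.

__global__ void warp_rank_demo(int *out)
{
    const unsigned int full_mask = 0xffffffffu;
    int pred = (threadIdx.x % 3) == 0;                                       // arbitrary predicate
    unsigned int voters = (unsigned int) cub::WARP_BALLOT(pred, full_mask);  // lanes voting true
    if (pred)
    {
        // The count of voting lanes below this one is this lane's exclusive rank among voters
        int rank = __popc(voters & cub::LaneMaskLt());
        out[rank] = (int) threadIdx.x;
    }
}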
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative down-offset of the peer to read from + int first_thread, ///< [in] Index of first lane in logical warp (typically 0) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) + * \ingroup WarpModule + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * successor of its successor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
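A sketch of the classic warp-sum pattern built on ShuffleDown (defined just below): halving offsets leave the sum of all 32 lanes in lane 0. Illustrative only; a single full warp is assumed.

__global__ void warp_sum_demo(int *out, const int *in)
{
    int val = in[threadIdx.x];
    for (int offset = 16; offset > 0; offset >>= 1)
        val += cub::ShuffleDown<32>(val, offset, 31, 0xffffffffu);
    if (cub::LaneId() == 0) out[0] = val;  // only lane 0 holds the full sum
}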
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleDown( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative up-offset of the peer to read from + int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input + * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, + * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from thread 0 + * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
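A sketch of broadcasting with ShuffleIndex (defined just below); for a full 32-thread warp this is equivalent to the built-in __shfl_sync. The source lane 7 is an arbitrary choice.

__global__ void warp_broadcast_demo(int *out, const int *in)
{
    int val = in[threadIdx.x];
    int from_lane7 = cub::ShuffleIndex<32>(val, 7, 0xffffffffu);  // every lane reads lane 7's value
    // Full-warp equivalent: __shfl_sync(0xffffffffu, val, 7)
    out[threadIdx.x] = from_lane7;
}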
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleIndex( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], + src_lane, + SHFL_C, + member_mask); + + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], + src_lane, + SHFL_C, + member_mask); + + output_alias[WORD] = shuffle_word; + } + + return output; +} + + + +/** + * Compute a 32b mask of threads having the same least-significant + * LABEL_BITS of \p label as the calling thread. + */ +template +inline __device__ unsigned int MatchAny(unsigned int label) +{ + unsigned int retval; + + // Extract masks of common threads for each bit + #pragma unroll + for (int BIT = 0; BIT < LABEL_BITS; ++BIT) + { + unsigned int mask; + unsigned int current_bit = 1 << BIT; + asm ("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;" + " setp.eq.u32 p, %0, %2;\n" +#ifdef CUB_USE_COOPERATIVE_GROUPS + " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" +#else + " vote.ballot.b32 %0, p;\n" +#endif + " @!p not.b32 %0, %0;\n" + "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); + + // Remove peers who differ + retval = (BIT == 0) ? mask : retval & mask; + } + + return retval; + +// // VOLTA match +// unsigned int retval; +// asm ("{\n" +// " match.any.sync.b32 %0, %1, 0xffffffff;\n" +// "}\n" : "=r"(retval) : "r"(label)); +// return retval; + +} + + + + + + + + + + + + + + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_type.cuh b/GraphBLAS/CUDA/local_cub/util_type.cuh new file mode 100644 index 0000000000..0ba41e1ed2 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_type.cuh @@ -0,0 +1,1167 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Common type manipulation (metaprogramming) utilities + */ + +#pragma once + +#include +#include +#include + +#if (__CUDACC_VER_MAJOR__ >= 9) + #include +#endif + +#include "util_macro.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + + +/****************************************************************************** + * Type equality + ******************************************************************************/ + +/** + * \brief Type selection (IF ? ThenType : ElseType) + */ +template +struct If +{ + /// Conditional type result + typedef ThenType Type; // true +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct If +{ + typedef ElseType Type; // false +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Conditional types + ******************************************************************************/ + +/** + * \brief Type equality test + */ +template +struct Equals +{ + enum { + VALUE = 0, + NEGATE = 1 + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Equals +{ + enum { + VALUE = 1, + NEGATE = 0 + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + +/** + * \brief Pointer vs. 
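A few compile-time checks illustrating the static-math and type-equality helpers above; the asserted values follow directly from the definitions (assumes util_type.cuh and util_macro.cuh are included).

CUB_STATIC_ASSERT(cub::Log2<8>::VALUE == 3, "log2(8), rounded up");
CUB_STATIC_ASSERT(cub::Log2<3>::VALUE == 2, "log2(3), rounded up");
CUB_STATIC_ASSERT(cub::PowerOfTwo<64>::VALUE == 1, "64 is a power of two");
CUB_STATIC_ASSERT((cub::Equals<int, int>::VALUE == 1), "same types compare equal");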
iterator + */ +template +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsPointer +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsVolatile +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * typename RemoveQualifiers::Type // int; + */ +template +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + + +/****************************************************************************** + * Marker types + ******************************************************************************/ + +/** + * \brief A simple "NULL" marker type + */ +struct NullType +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template + __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } + + __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } + + __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } + +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + + +/** + * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) + */ +template +struct Int2Type +{ + enum {VALUE = A}; +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/****************************************************************************** + * Size and alignment + ******************************************************************************/ + +/// Structure alignment +template +struct AlignBytes +{ + struct Pad + { + T val; + char byte; + }; + + enum + { + /// The "true CUDA" alignment of T in bytes + ALIGN_BYTES = sizeof(Pad) - sizeof(T) + }; + + /// The "truly aligned" type + typedef T Type; +}; + +// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree +// with device C++ compilers (EDG) on types passed as template parameters through +// kernel functions + +#define __CUB_ALIGN_BYTES(t, b) \ + template <> struct AlignBytes \ + { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; + +__CUB_ALIGN_BYTES(short4, 8) +__CUB_ALIGN_BYTES(ushort4, 8) +__CUB_ALIGN_BYTES(int2, 8) +__CUB_ALIGN_BYTES(uint2, 8) +__CUB_ALIGN_BYTES(long long, 8) +__CUB_ALIGN_BYTES(unsigned long long, 8) +__CUB_ALIGN_BYTES(float2, 8) +__CUB_ALIGN_BYTES(double, 8) +#ifdef _WIN32 + __CUB_ALIGN_BYTES(long2, 8) + __CUB_ALIGN_BYTES(ulong2, 8) +#else + __CUB_ALIGN_BYTES(long2, 16) + __CUB_ALIGN_BYTES(ulong2, 16) +#endif +__CUB_ALIGN_BYTES(int4, 16) +__CUB_ALIGN_BYTES(uint4, 16) 
+__CUB_ALIGN_BYTES(float4, 16) +__CUB_ALIGN_BYTES(long4, 16) +__CUB_ALIGN_BYTES(ulong4, 16) +__CUB_ALIGN_BYTES(longlong2, 16) +__CUB_ALIGN_BYTES(ulonglong2, 16) +__CUB_ALIGN_BYTES(double2, 16) +__CUB_ALIGN_BYTES(longlong4, 16) +__CUB_ALIGN_BYTES(ulonglong4, 16) +__CUB_ALIGN_BYTES(double4, 16) + +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; + + +/// Unit-words of data movement +template +struct UnitWord +{ + enum { + ALIGN_BYTES = AlignBytes::ALIGN_BYTES + }; + + template + struct IsMultiple + { + enum { + UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + }; + }; + + /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned int, + typename If::IS_MULTIPLE, + unsigned short, + unsigned char>::Type>::Type ShuffleWord; + + /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned long long, + ShuffleWord>::Type VolatileWord; + + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + ulonglong2, + VolatileWord>::Type DeviceWord; + + /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + uint4, + typename If::IS_MULTIPLE, + uint2, + ShuffleWord>::Type>::Type TextureWord; +}; + + +// float2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint2 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef unsigned long long DeviceWord; +#endif + typedef float2 TextureWord; +}; + +// float4 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint4 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef ulonglong2 DeviceWord; +#endif + typedef float4 TextureWord; +}; + + +// char2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef unsigned short ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef unsigned short VolatileWord; + typedef short DeviceWord; +#else + typedef unsigned short VolatileWord; + typedef unsigned short DeviceWord; +#endif + typedef unsigned short TextureWord; +}; + + +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Vector type inference utilities. + ******************************************************************************/ + +/** + * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. 
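A small sketch of the vector-type mapping documented above; the expansions generated further below make cub::CubVector<int, 2> a thin, operator-equipped wrapper over the built-in int2.

__host__ __device__ void cubvector_demo()
{
    cub::CubVector<int, 2> a, b;
    a.x = 1;  a.y = 2;
    b.x = 10; b.y = 20;
    cub::CubVector<int, 2> c = a + b;         // component-wise: {11, 22}
    cub::CubVector<int, 2>::Type native = c;  // ::Type is the built-in int2
    (void) native;
}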
+ */ +template struct CubVector; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +enum +{ + /// The maximum number of elements in CUDA vector types + MAX_VEC_ELEMENTS = 4, +}; + + +/** + * Generic vector-1 type + */ +template +struct CubVector +{ + T x; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-2 type + */ +template +struct CubVector +{ + T x; + T y; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-3 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-4 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + T w; + + typedef T BaseType; + typedef CubVector Type; +}; + + +/** + * Macro for expanding partially-specialized built-in vector types + */ +#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ + \ + template<> struct CubVector : short_type##1 \ + { \ + typedef base_type BaseType; \ + typedef short_type##1 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##2 \ + { \ + typedef base_type BaseType; \ + typedef short_type##2 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##3 \ + { \ + typedef base_type BaseType; \ + typedef short_type##3 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##4 \ + { \ + typedef base_type BaseType; \ + typedef short_type##4 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + retval.w = w + other.w; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + retval.w = w - other.w; \ + return retval; \ + } \ + }; + + + +// Expand CUDA vector types for built-in primitives +CUB_DEFINE_VECTOR_TYPE(char, char) +CUB_DEFINE_VECTOR_TYPE(signed char, char) +CUB_DEFINE_VECTOR_TYPE(short, short) +CUB_DEFINE_VECTOR_TYPE(int, int) +CUB_DEFINE_VECTOR_TYPE(long, long) +CUB_DEFINE_VECTOR_TYPE(long long, longlong) +CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) +CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) +CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) +CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) 
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +CUB_DEFINE_VECTOR_TYPE(float, float) +CUB_DEFINE_VECTOR_TYPE(double, double) +CUB_DEFINE_VECTOR_TYPE(bool, uchar) + +// Undefine macros +#undef CUB_DEFINE_VECTOR_TYPE + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Wrapper types + ******************************************************************************/ + +/** + * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions + */ +template +struct Uninitialized +{ + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + WORDS = sizeof(T) / sizeof(DeviceWord) + }; + + /// Backing storage + DeviceWord storage[WORDS]; + + /// Alias + __host__ __device__ __forceinline__ T& Alias() + { + return reinterpret_cast(*this); + } +}; + + +/** + * \brief A key identifier paired with a corresponding value + */ +template < + typename _Key, + typename _Value +#if defined(_WIN32) && !defined(_WIN64) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) +#endif // #if defined(_WIN32) && !defined(_WIN64) + > +struct KeyValuePair +{ + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type + + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#if defined(_WIN32) && !defined(_WIN64) + +/** + * Win32 won't do 16B alignment. 
This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct + */ + +/// Smaller key specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Value value; // Value has larger would-be alignment and goes first + Key key; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + + +/// Smaller value specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Key key; // Key has larger would-be alignment and goes first + Value value; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#endif // #if defined(_WIN32) && !defined(_WIN64) + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * \brief A wrapper for passing simple static arrays as kernel parameters + */ +template +struct ArrayWrapper +{ + + /// Statically-sized array of type \p T + T array[COUNT]; + + /// Constructor + __host__ __device__ __forceinline__ ArrayWrapper() {} +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. + * + * Many multi-pass computations require a pair of "ping-pong" storage + * buffers (e.g., one for reading from and the other for writing to, and then + * vice-versa for the subsequent pass). This structure wraps a set of device + * buffers and a "selector" member to track which is "current". 
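A sketch of the usual ping-pong pattern with DoubleBuffer, shown here with the device-wide radix sort from the full CUB distribution (not part of this local_cub excerpt); d_key_buf, d_key_alt_buf, and num_items are assumed to exist, and error checking is omitted.

// Two device allocations of num_items keys each
cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

// First call sizes the temporary storage, second call sorts;
// the sorted keys land in d_keys.Current().
void   *d_temp_storage     = NULL;
size_t  temp_storage_bytes = 0;
cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
int *d_sorted_keys = d_keys.Current();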
+ */ +template +struct DoubleBuffer +{ + /// Pair of device buffer pointers + T *d_buffers[2]; + + /// Selector into \p d_buffers (i.e., the active/valid buffer) + int selector; + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer() + { + selector = 0; + d_buffers[0] = NULL; + d_buffers[1] = NULL; + } + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer( + T *d_current, ///< The currently valid buffer + T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current + { + selector = 0; + d_buffers[0] = d_current; + d_buffers[1] = d_alternate; + } + + /// \brief Return pointer to the currently valid buffer + __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } + + /// \brief Return pointer to the currently invalid buffer + __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } + +}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + + +/** + * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name + */ +#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ + template \ + struct detector_name \ + { \ + template \ + static char& test(typename C::nested_type_name*); \ + template \ + static int& test(...); \ + enum \ + { \ + VALUE = sizeof(test(0)) < sizeof(int) \ + }; \ + }; + + + +/****************************************************************************** + * Simple enable-if (similar to Boost) + ******************************************************************************/ + +/** + * \brief Simple enable-if (similar to Boost) + */ +template +struct EnableIf +{ + /// Enable-if type for SFINAE dummy variables + typedef T Type; +}; + + +template +struct EnableIf {}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + +/** + * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) + */ +template +struct BinaryOpHasIdxParam +{ +private: +/* + template struct SFINAE1 {}; + template struct SFINAE2 {}; + template struct SFINAE3 {}; + template struct SFINAE4 {}; +*/ + template struct SFINAE5 {}; + template struct SFINAE6 {}; + template struct SFINAE7 {}; + template struct SFINAE8 {}; +/* + template static char Test(SFINAE1 *); + template static char Test(SFINAE2 *); + template static char Test(SFINAE3 *); + template static char Test(SFINAE4 *); +*/ + template __host__ __device__ static char Test(SFINAE5 *); + template __host__ __device__ static char Test(SFINAE6 *); + template __host__ __device__ static char Test(SFINAE7 *); + template __host__ __device__ static char Test(SFINAE8 *); + + template static int Test(...); + +public: + + /// Whether the functor BinaryOp has a third unsigned int index param + static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); +}; + + + + +/****************************************************************************** + * Simple type traits utilities. 
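A sketch of the nested-type detector macro above; the detector name has_value_type and the two probe structs are made-up for illustration.

// Generates has_value_type<T>::VALUE, nonzero iff T exposes a nested type named value_type
CUB_DEFINE_DETECT_NESTED_TYPE(has_value_type, value_type)

struct WithNested    { typedef int value_type; };
struct WithoutNested { };

CUB_STATIC_ASSERT(has_value_type<WithNested>::VALUE    == 1, "nested typedef detected");
CUB_STATIC_ASSERT(has_value_type<WithoutNested>::VALUE == 0, "no nested typedef");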
+ * + * For example: + * Traits::CATEGORY // SIGNED_INTEGER + * Traits::NULL_TYPE // true + * Traits::CATEGORY // NOT_A_NUMBER + * Traits::PRIMITIVE; // false + * + ******************************************************************************/ + +/** + * \brief Basic type traits categories + */ +enum Category +{ + NOT_A_NUMBER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING_POINT +}; + + +/** + * \brief Basic type traits + */ +template +struct BaseTraits +{ + /// Category + static const Category CATEGORY = _CATEGORY; + enum + { + PRIMITIVE = _PRIMITIVE, + NULL_TYPE = _NULL_TYPE, + }; +}; + + +/** + * Basic type traits (unsigned primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = UNSIGNED_INTEGER; + static const UnsignedBits LOWEST_KEY = UnsignedBits(0); + static const UnsignedBits MAX_KEY = UnsignedBits(-1); + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + + +/** + * Basic type traits (signed primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = SIGNED_INTEGER; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = HIGH_BIT; + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + +template +struct FpLimits; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ float Max() { + return FLT_MAX; + } + + static __host__ __device__ __forceinline__ float Lowest() { + return FLT_MAX * float(-1); + } +}; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ double Max() { + return DBL_MAX; + } + + static __host__ __device__ __forceinline__ double Lowest() { + return DBL_MAX * double(-1); + } +}; + + +#if (__CUDACC_VER_MAJOR__ >= 9) +template <> +struct FpLimits<__half> +{ + static __host__ __device__ __forceinline__ __half Max() { + unsigned short max_word = 0x7BFF; + return reinterpret_cast<__half&>(max_word); + } + + static __host__ __device__ __forceinline__ __half Lowest() { + unsigned short lowest_word = 0xFBFF; + return reinterpret_cast<__half&>(lowest_word); + } +}; +#endif + + +/** + * Basic type traits (fp primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + 
static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; + return key ^ mask; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); + return key ^ mask; + }; + + static __host__ __device__ __forceinline__ T Max() { + return FpLimits::Max(); + } + + static __host__ __device__ __forceinline__ T Lowest() { + return FpLimits::Lowest(); + } +}; + + +/** + * \brief Numeric type traits + */ +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +#if (__CUDACC_VER_MAJOR__ >= 9) + template <> struct NumericTraits<__half> : BaseTraits {}; +#endif + +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; + + + +/** + * \brief Type traits + */ +template +struct Traits : NumericTraits::Type> {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_shfl.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_shfl.cuh new file mode 100644 index 0000000000..bbbf37e5c7 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_shfl.cuh @@ -0,0 +1,541 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp reduction steps + STEPS = Log2::VALUE, + + /// Number of logical warps in a PTX warp + LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + + }; + + template + struct IsInteger + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &/*temp_storage*/) + { + lane_id = LaneId(); + warp_id = 0; + member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + + if (!IS_ARCH_WARP) + { + warp_id = lane_id / LOGICAL_WARP_THREADS; + lane_id = lane_id % LOGICAL_WARP_THREADS; + member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); + } + } + + + 
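// Worked example for the constructor above (illustrative note, not part of the
// original CUB header): with LOGICAL_WARP_THREADS == 16 on a 32-thread physical
// warp, IS_ARCH_WARP is false, so physical lane 21 computes
//     warp_id     = 21 / 16 = 1
//     lane_id     = 21 % 16 = 5
//     member_mask = (0xffffffff >> 16) << (1 * 16) = 0xffff0000
// i.e. only the upper half-warp participates in this logical warp's shuffles.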
//--------------------------------------------------------------------- + // Reduction steps + //--------------------------------------------------------------------- + + /// Reduction (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int ReduceStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across fp32 types) + __device__ __forceinline__ float ReduceStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long ReduceStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across long long types) + __device__ __forceinline__ long long ReduceStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across double types) + __device__ __forceinline__ double ReduceStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); + + output.key = input.key; + output.value = ReduceStep( + input.value, + cub::Sum(), + last_lane, + offset, + Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key != other_key) + output.value = input.value; + + return output; + } + + + + /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } + + + /// Reduction step (generic) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T output = input; + + _T temp = ShuffleDown(output, offset, last_lane, member_mask); + + // Perform reduction op if valid + if (offset + lane_id <= last_lane) + output = reduction_op(input, temp); + + return output; + } + + + /// Reduction step (specialized for small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. 
+ ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ReduceStep( + T& input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + Int2Type /*step*/) + { + input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + ReduceStep(input, reduction_op, last_lane, Int2Type()); + } + + template + __device__ __forceinline__ void ReduceStep( + T& /*input*/, ///< [in] Calling thread's input item. + ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator + int /*last_lane*/, ///< [in] Index of last lane in segment + Int2Type /*step*/) + {} + + + //--------------------------------------------------------------------- + // Reduction operations + //--------------------------------------------------------------------- + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int last_lane = (ALL_LANES_VALID) ? + LOGICAL_WARP_THREADS - 1 : + valid_items - 1; + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the start flags for each thread in the warp. 
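        // [Editorial note — illustrative, not part of the vendored CUB header.]
        // The ballot below sets one bit per lane where `flag` is true.  Head flags
        // are then shifted down by one, so a head at lane i marks lane i-1 as the
        // end of the preceding segment; bits below the calling lane are masked off;
        // the top logical lane is always forced on; and __clz(__brev(x)) returns the
        // index of the lowest set bit, i.e. the last lane of the calling thread's
        // segment.  Worked example (32-thread warp, HEAD_SEGMENTED, head flags at
        // lanes 0, 5 and 9, calling lane 2):
        //     ballot          = ...0000_0010_0010_0001   (bits 0, 5, 9)
        //     >> 1            = ...0000_0001_0001_0000   (bits 4, 8)
        //     & LaneMaskGe()  = bits 4 and 8              (unchanged for lane 2)
        //     | top-lane bit  = bit 31 also set
        //     last_lane       = __clz(__brev(x)) = 4  ->  lanes 2..4 reduce together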
+ int warp_flags = WARP_BALLOT(flag, member_mask); + + // Convert to tail-segmented + if (HEAD_SEGMENTED) + warp_flags >>= 1; + + // Mask out the bits below the current thread + warp_flags &= LaneMaskGe(); + + // Mask of physical lanes outside the logical warp and convert to logical lanemask + if (!IS_ARCH_WARP) + { + warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); + } + + // Mask in the last lane of logical warp + warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); + + // Find the next set flag + int last_lane = __clz(__brev(warp_flags)); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_smem.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 0000000000..7baa573be1 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,372 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + + /// FlagT status (when not using ballot) + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + struct _TempStorage + { + T reduce[WARP_SMEM_ELEMENTS]; + SmemFlag flags[WARP_SMEM_ELEMENTS]; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Regular reduction + //--------------------------------------------------------------------- + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + + return ReduceStep(input, valid_items, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp /*reduction_op*/, ///< [in] Reduction operator + Int2Type /*step*/) + { + return input; + } + + + //--------------------------------------------------------------------- + // Segmented reduction + //--------------------------------------------------------------------- + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
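        // [Editorial note — illustrative, not part of the vendored CUB header.]
        // Unlike the SHFL variant, this path keeps head flags as-is: tail flags are
        // first shifted left by one so a tail at lane i becomes a mark at lane i+1,
        // the first lane of the next segment.  After masking to lanes strictly above
        // the calling lane, __clz(__brev(x)) yields next_flag, the first lane of the
        // *next* segment, and the loop below only folds in peers with
        // lane_id + OFFSET < next_flag.  Worked example (head flags at lanes 0, 5
        // and 9, calling lane 2): the surviving bits are 5 and 9, so next_flag = 5
        // and lanes 2..4 are reduced together, matching the SHFL-based result above.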
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if (OFFSET + lane_id < next_flag) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + WARP_SYNC(member_mask); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_shfl.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 0000000000..7f4e1c94bb --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,632 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + }; + + template + struct IntegerTraits + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + /// Shared memory storage layout type + struct TempStorage {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpScanShfl( + TempStorage &/*temp_storage*/) + { + lane_id = LaneId(); + warp_id = 0; + member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + + if (!IS_ARCH_WARP) + { + warp_id = lane_id / LOGICAL_WARP_THREADS; + lane_id = lane_id % LOGICAL_WARP_THREADS; + member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scan steps + //--------------------------------------------------------------------- + + /// Inclusive prefix scan step (specialized for summation across int32 types) + __device__ __forceinline__ int InclusiveScanStep( + int input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + /// Inclusive prefix scan step (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int InclusiveScanStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp32 types) + __device__ __forceinline__ float InclusiveScanStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long InclusiveScanStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across long long types) + __device__ __forceinline__ long long InclusiveScanStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp64 types) + __device__ __forceinline__ double InclusiveScanStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + +/* + /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePairInclusiveScanStep( + KeyValuePair input, ///< [in] Calling thread's input item. + ReduceBySegmentOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } +*/ + + /// Inclusive prefix scan step (generic) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T temp = ShuffleUp(input, offset, first_lane, member_mask); + + // Perform scan op if from a valid peer + _T output = scan_op(temp, input); + if (static_cast(lane_id) < first_lane + offset) + output = input; + + return output; + } + + + /// Inclusive prefix scan step (specialized for small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. 
+ ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return ShuffleIndex(input, src_lane, member_mask); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + // Iterate scan steps + int segment_first_lane = 0; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output = InclusiveScanStep( + inclusive_output, + scan_op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + + } + + /// Inclusive scan, specialized for reduce-value-by-key + template + __device__ __forceinline__ void InclusiveScan( + KeyValuePair input, ///< [in] Calling thread's input item. + KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ReduceByKeyOp scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); + + unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + + // Mask away all lanes greater than ours + ballot = ballot & LaneMaskLe(); + + // Find index of first set bit + int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output.value = InclusiveScanStep( + inclusive_output.value, + scan_op.op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
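    // [Editorial note — illustrative, not part of the vendored CUB header.]
    // On the reduce-value-by-key scan above: WARP_BALLOT marks the lanes whose key
    // differs from the previous lane's key, LaneMaskLe() keeps only the marks at or
    // below the calling lane, and 31 - __clz(...) picks the highest surviving mark,
    // i.e. the first lane of the calling thread's run of equal keys.  Worked example
    // with per-lane keys A A B B B C ... and calling lane 4 (key B):
    //     discontinuity ballot   : bits 2 and 5 set
    //     & LaneMaskLe() (lane 4): bit 2 remains
    //     segment_first_lane     : 31 - __clz(0b100) = 2
    // An empty ballot gives 31 - 32 = -1, which CUB_MAX(0, ...) clamps to lane 0.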
+ { + InclusiveScan(input, inclusive_output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); + Update(input, inclusive, exclusive, scan_op, is_integer); + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); + Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_smem.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 0000000000..3237fcbfe9 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,397 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
+ */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) + typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) + template < + bool HAS_IDENTITY, + int STEP, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share partial into buffer + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); + + WARP_SYNC(member_mask); + + // Update partial if addend is in range + if (HAS_IDENTITY || (lane_id >= OFFSET)) + { + T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); + partial = scan_op(addend, partial); + } + WARP_SYNC(member_mask); + + ScanStep(partial, scan_op, Int2Type()); + } + + + /// Basic inclusive scan iteration(template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &/*partial*/, + ScanOp /*scan_op*/, + Int2Type /*step*/) + {} + + + /// Inclusive prefix scan (specialized for summation across primitive types) + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + T identity = 0; + ThreadStore(&temp_storage[lane_id], (CellT) identity); + + WARP_SYNC(member_mask); + + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /// Inclusive prefix scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + if (lane_id == src_lane) + { + ThreadStore(temp_storage, (CellT) input); + } + + WARP_SYNC(member_mask); + + return (T)ThreadLoad(temp_storage); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, scan_op); + + // Retrieve aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT /*scan_op*/, + IsIntegerT /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + } + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T 
&exclusive, + T &warp_aggregate, + cub::Sum /*scan_o*/, + Int2Type /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + exclusive = inclusive - input; + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + // Broadcast warp aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + + // Update inclusive with initial value + inclusive = scan_op(initial_value, inclusive); + + // Get exclusive from exclusive + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); + + if (lane_id == 0) + exclusive = initial_value; + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/warp_reduce.cuh b/GraphBLAS/CUDA/local_cub/warp/warp_reduce.cuh new file mode 100644 index 0000000000..189896b071 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/warp_reduce.cuh @@ -0,0 +1,612 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... 
+ * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpReduceShfl, + WarpReduceSmem >::Type InternalWarpReduce; + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + +private: + + /// Shared memory storage layout type for WarpReduce + typedef typename InternalWarpReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + +public: + + /// \smemstorage{WarpReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()) + {} + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Return the warp-wide sums to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); + } + + /** + * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum( + * thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is + * undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( + * thread_data, head_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). 
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * + */ + template < + typename FlagT> + __device__ __forceinline__ T HeadSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return HeadSegmentedReduce(input, head_flag, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( + * thread_data, tail_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename FlagT> + __device__ __forceinline__ T TailSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return TailSegmentedReduce(input, tail_flag, cub::Sum()); + } + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + /** + * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp max reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide reductions to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( + * thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, + * \p 95, and \p 127, respectively (and is undefined in other threads). 
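The inline usage snippets in this header appear to have lost their template arguments and include targets when the diff was captured (e.g. "typedef cub::WarpReduce WarpReduce;" with no element type). The following is a hedged, self-contained sketch of what those snippets intend; the include path, kernel name, and parameter names are illustrative assumptions, not part of the vendored header.

// Sketch only: reconstructs the WarpReduce usage documented above.
#include "local_cub/warp/warp_reduce.cuh"    // assumed vendored path

__global__ void WarpReduceExample (const int *d_in, int *d_warp_sums)
{
    // Specialize WarpReduce for one 32-thread warp of int items
    typedef cub::WarpReduce<int> WarpReduce ;

    // One TempStorage per warp, assuming a 128-thread block (4 warps)
    __shared__ typename WarpReduce::TempStorage temp_storage [4] ;

    int thread_data = d_in [blockIdx.x * blockDim.x + threadIdx.x] ;
    int warp_id = threadIdx.x / 32 ;

    // warp-wide sum: only lane 0 of each warp gets a defined aggregate
    int warp_sum = WarpReduce (temp_storage [warp_id]).Sum (thread_data) ;

    // a generic reduction with a user-supplied operator has the same shape:
    //   int warp_max = WarpReduce (temp_storage [warp_id]).Reduce (thread_data, cub::Max ()) ;

    if (threadIdx.x % 32 == 0)
    {
        d_warp_sums [blockIdx.x * 4 + warp_id] = warp_sum ;
    }
}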
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + } + + /** + * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max(), valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is + * undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( + * thread_data, head_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. 
(and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T HeadSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( + * thread_data, tail_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T TailSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); + } + + + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/warp_scan.cuh b/GraphBLAS/CUDA/local_cub/warp/warp_scan.cuh new file mode 100644 index 0000000000..c7af0d343d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/warp_scan.cuh @@ -0,0 +1,936 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_scan_shfl.cuh" +#include "specializations/warp_scan_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) + * + * \tparam T The scan input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - Supports non-commutative scan operators + * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. 
generic scan) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpScan} + * \par + * The code snippet below illustrates four concurrent warp prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, 3, ..., 31}. + * + * \par + * The code snippet below illustrates a single warp prefix sum within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for one warp + * __shared__ typename WarpScan::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a prefix sum + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// Whether the data type is an integer (which has fully-associative addition) + IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) + }; + + /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpScanShfl, + WarpScanSmem >::Type InternalWarpScan; + + /// Shared memory storage layout type for WarpScan + typedef typename InternalWarpScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + unsigned int lane_id; + + + + /****************************************************************************** + * Public types + ******************************************************************************/ + +public: + + /// \smemstorage{WarpScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + InclusiveScan(input, inclusive_output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); + } + + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. 
+ * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + initial_value, + Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Combination (inclusive & exclusive) prefix scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. 
Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. 
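As with WarpReduce above, the WarpScan snippets in this header seem to have dropped their element-type template arguments in the diff text. A minimal hedged sketch of the documented calls follows; the include path and names are illustrative assumptions.

// Sketch only: mirrors the WarpScan usage documented above.
#include <limits.h>
#include "local_cub/warp/warp_scan.cuh"      // assumed vendored path

__global__ void WarpScanExample (int *d_data)
{
    typedef cub::WarpScan<int> WarpScan ;
    __shared__ typename WarpScan::TempStorage temp_storage [4] ;   // 4 warps per 128-thread block

    int tid = blockIdx.x * blockDim.x + threadIdx.x ;
    int thread_data = d_data [tid] ;
    int warp_id = threadIdx.x / 32 ;

    // exclusive prefix sum: lane 0 receives 0, lane k the sum of lanes 0..k-1
    WarpScan (temp_storage [warp_id]).ExclusiveSum (thread_data, thread_data) ;

    // the combined form returns both partials and seeds the exclusive scan with INT_MIN:
    //   int inclusive_partial, exclusive_partial ;
    //   WarpScan (temp_storage [warp_id]).Scan (thread_data, inclusive_partial,
    //       exclusive_partial, INT_MIN, cub::Max ()) ;

    d_data [tid] = thread_data ;
}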
+ * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Data exchange + *********************************************************************/ + //@{ + + /** + * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the warp-wide broadcasts of values from + * lanes0 in each of four warps to all other threads in those warps. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Broadcast from lane0 in each warp to all other threads in the warp + * int warp_id = threadIdx.x / 32; + * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p thread_data will be + * {0, 0, ..., 0} in warp0, + * {32, 32, ..., 32} in warp1, + * {64, 64, ..., 64} in warp2, etc. 
+ */ + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return InternalWarpScan(temp_storage).Broadcast(input, src_lane); + } + + //@} end member group + +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/matrix.h b/GraphBLAS/CUDA/matrix.h new file mode 100644 index 0000000000..11930475b7 --- /dev/null +++ b/GraphBLAS/CUDA/matrix.h @@ -0,0 +1,72 @@ +//SPDX-License-Identifier: Apache-2.0 + +#define chunksize 128 + +#define ASSERT +#define GB_RESTRICT __restrict__ +//#define GB_GETA( aval, ax, p) aval = (T_Z)ax[ ( p )] +//#define GB_GETB( bval, bx, p) bval = (T_Z)bx[ ( p )] +#define GB_ADD_F( f , s) f = GB_ADD ( f, s ) +#define GB_C_MULT( c, a, b) c = GB_MULT( (a), (b) ) +#define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) ) +#define GB_DOT_TERMINAL ( c ) +//# if ( c == TERMINAL_VALUE) break; + +#include "GB_imin.h" +#include "GB_zombie.h" +#include "GB_nnz.h" +#include "GB_partition.h" +#include "GB_binary_search.h" +#include "GB_search_for_vector_template.c" + +#undef GB_DOT_MERGE +// cij += A(k,i) * B(k,j), for merge operation +#define GB_DOT_MERGE \ +{ \ + GB_GETA ( aki= (T_Z)Ax[pA]) ; /* aki = A(k,i) */ \ + GB_GETB ( bkj= (T_Z)Bx[pB]) ; /* bkj = B(k,j) */ \ + if (cij_exists) \ + { \ + GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \ + } \ + else \ + { \ + /* cij = A(k,i) * B(k,j), and add to the pattern */ \ + cij_exists = true ; \ + GB_C_MULT (cij, aki, bkj) ; /* cij = aki * bkj */ \ + } \ +} + + +typedef void (*GxB_binary_function) (void *, const void *, const void *) ; + +#include "GB_opaque.h" + +typedef enum +{ + // for all GrB_Descriptor fields: + GxB_DEFAULT = 0, // default behavior of the method + + // for GrB_OUTP only: + GrB_REPLACE = 1, // clear the output before assigning new values to it + + // for GrB_MASK only: + GrB_COMP = 2, // use the structural complement of the input + GrB_SCMP = 2, // same as GrB_COMP (deprecated; use GrB_COMP instead) + GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values + + // for GrB_INP0 and GrB_INP1 only: + GrB_TRAN = 3, // use the transpose of the input + + // for GxB_GPU_CONTROL only: + GxB_GPU_ALWAYS = 4, + GxB_GPU_NEVER = 5, + + // for GxB_AxB_METHOD only: + GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method + GxB_AxB_DOT = 1003, // dot product + GxB_AxB_HASH = 1004, // hash-based saxpy method + GxB_AxB_SAXPY = 1005 // saxpy method (any kind) +} +GrB_Desc_Value ; + diff --git a/GraphBLAS/CUDA/stringify.cpp b/GraphBLAS/CUDA/stringify.cpp new file mode 100644 index 0000000000..47ebbe77dd --- /dev/null +++ b/GraphBLAS/CUDA/stringify.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
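For readers unfamiliar with the jitified macros, the GB_DOT_MERGE block in matrix.h above expands differently for each semiring. A hypothetical expansion for a plus-times semiring over double, assuming GB_GETA/GB_GETB load Ax[pA]/Bx[pB], GB_MULT is '*', and GB_ADD is '+', would behave like:

//  double aki = (double) Ax [pA] ;     // aki = A(k,i)
//  double bkj = (double) Bx [pB] ;     // bkj = B(k,j)
//  if (cij_exists)
//  {
//      cij += aki * bkj ;              // GB_MULTADD: cij = GB_ADD (cij, GB_MULT (aki, bkj))
//  }
//  else
//  {
//      cij_exists = true ;             // C(i,j) enters the pattern
//      cij = aki * bkj ;               // GB_C_MULT: cij = GB_MULT (aki, bkj)
//  }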
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Stringify is a simple utility to convert text files to C string literals. + */ + +#include +#include +#include +#include + +// Replaces non-alphanumeric characters with '_' and +// prepends '_' if the string begins with a digit. +std::string sanitize_varname(std::string const& s) { + std::string r = s; + if (std::isdigit(r[0])) { + r = '_' + r; + } + for (std::string::iterator it = r.begin(); it != r.end(); ++it) { + if (!std::isalnum(*it)) { + *it = '_'; + } + } + return r; +} +// Replaces " with \" +std::string sanitize_string_literal(std::string const& s) { + std::stringstream ss; + for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) { + if (*it == '"' || *it == '\\') { + ss << '\\'; + } + ss << *it; + } + return ss.str(); +} + +int main(int argc, char* argv[]) { + if (argc <= 1 || argv[1][0] == '-') { + std::cout << "Stringify - Converts text files to C string literals" + << std::endl; + std::cout << "Usage: " << argv[0] << " infile [varname] > outfile" + << std::endl; + return -1; + } + char* filename = argv[1]; + std::string varname = (argc > 2) ? argv[2] : sanitize_varname(filename); + std::ifstream istream(filename); + std::ostream& ostream = std::cout; + std::string line; + // Note: This puts "filename\n" at the beginning of the string, which is + // what jitify expects. + ostream << "const char* const " << varname << " = " + << "\"" << filename << "\\n\"" << std::endl; + while (std::getline(istream, line)) { + ostream << "\"" << sanitize_string_literal(line) << "\\n\"" << std::endl; + } + ostream << ";" << std::endl; + return 0; +} diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase1.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase1.cu new file mode 100644 index 0000000000..aa367c7b9a --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase1.cu @@ -0,0 +1,598 @@ +//------------------------------------------------------------------------------ +// templates/GB_AxB_cuda_dot3_phase1: symbolic load balancing and data partition +// to assign work to different 'buckets' for later compute +//------------------------------------------------------------------------------ + +// This kernel scans the non-zero pattern in A and B, takes into account the +// mask and computes total work required to form C. Then it classifies each +// dot product into a set of buckets for efficient compute. 
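To make the role of the stringify utility above concrete: given a one-line input file, say hello.cu containing "__global__ void hello ( ) { }", an invocation along the lines of "stringify hello.cu hello_cu > hello.cu.jit" (file and variable names hypothetical) emits a C string literal of this shape, with the filename on the first quoted line as jitify expects:

// illustrative stringify output, not a file in this commit:
const char* const hello_cu = "hello.cu\n"
"__global__ void hello ( ) { }\n"
;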
+ +#define GB_KERNEL +#include +#include +#include "matrix.h" +#include "GB_cuda_buckets.h" +#include "local_cub/block/block_scan.cuh" +#include "mySemiRing.h" + +//------------------------------------------------------------------------------ +// GB_bucket_assignment +//------------------------------------------------------------------------------ + +// assign the dot product C(i,j) = A(:,i)'*B(:,j) to a specific bucket +__device__ static inline GB_bucket_code GB_bucket_assignment +( + int64_t ainz, // # of entries A(:,i), always > 0 + int64_t bjnz, // # of entries B(:,j), always > 0 + int64_t vlen // vector length of A(:,i) and B(:,j) +) +{ + + int b = 0 ; // no bucket assigned yet + + // GB_BUCKET (condition,bucket) : assigns an entry to a bucket, + // if the condition holds, but without using any if statements. + // An entry is assigned once and not reassigned. + + // If the bucket b has not assigned, it is b = 0. The GB_BUCKET function + // tests this case, and if the condition is also true, the expression + // (b==0) * condition * (bucket+1) becomes equal to bucket+1. This + // value is added to b, which is zero, so the final result is that b + // is set to bucket+1. + + // If the bucket b has been assigned already, we have b > 0. Thus, + // the expression ((b==0) * condition * (bucket+1)) becomes zero. + // When added to b, the result is that b doesn't change, so the bucket + // assignment b is unmodified. + + #define GB_BUCKET(condition,bucket) \ + b = (((b == 0) * (condition)) * (bucket+1)) + b ; + +// if (ia_last < ib_first || ib_last < ia_first) + { + + //---------------------------------------------------------------------- + // pattern of A(:,i) and B(:,j) do not overlap + //---------------------------------------------------------------------- + + // The patterns of A(:,i) and B(:,j) are always sorted. If the last + // entry in A(:,i) comes before the first entry in B(:,j), or visa + // versa, then there is no work to do since C(i,j) must be a zombie. + + // GB_BUCKET (ia_last < ib_first || ib_last < ia_first, GB_BUCKET_ZOMBIE); + + } +// else if (bjnz == vlen && ainz == vlen && vlen > 256) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are dense + //---------------------------------------------------------------------- + + // No search of A(:,i) or B(:,j) is needed. Total work is O(vlen). + // The intersection is non-empty, so C(i,j) cannot be a zombie. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit + + GB_BUCKET (bjnz == vlen && ainz == vlen && vlen > 256, GB_BUCKET_DNDN) ; + + } +// else if (ainz == vlen) + { + + //---------------------------------------------------------------------- + // A(:,i) is dense and B(:,j) is sparse + //---------------------------------------------------------------------- + + // No search of A(:,i) is needed. Total work is O(bjnz), via a linear + // time scan of B(:,j). Since A(:,i) is dense and B(:,j) is non-empty, + // the intersection is non-empty, so C(i,j) cannot be a zombie. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit + // Two buckets are used, depending on bjnz. 
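+        // Worked example of the branchless GB_BUCKET update (illustrative):
+        // suppose ainz == vlen and bjnz == 100.  On entry b == 0, so the
+        // first GB_BUCKET below computes ((b==0) * condition) * (bucket+1)
+        // = 1 * 1 * (GB_BUCKET_DNVS+1) and sets b = GB_BUCKET_DNVS+1.  The
+        // second GB_BUCKET sees b != 0, so its expression is zero and b is
+        // left unchanged.  The final "return (b-1)" undoes the "+1".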
+ GB_BUCKET (ainz == vlen && bjnz < 256, GB_BUCKET_DNVS) ; + GB_BUCKET (ainz == vlen && bjnz >= 256, GB_BUCKET_DNSP) ; + + } +// else if (bjnz == vlen) + { + + //---------------------------------------------------------------------- + // A(:,i) is sparse and B(:,j) is dense + //---------------------------------------------------------------------- + + // No search of B(:,j) is needed. Total work is O(ainz), via a linear + // time scan of A(:,i). Since B(:,j) is dense and A(:,i) is non-empty, + // the intersection is non-empty, so C(i,j) cannot be a zombie. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit + // Two buckets are used, depending on ainz. + GB_BUCKET (bjnz == vlen && ainz < 256, GB_BUCKET_VSDN) ; + GB_BUCKET (bjnz == vlen && ainz >= 256, GB_BUCKET_SPDN) ; + + } +// else if ((ainz > 32 * bjnz && bjnz < 256) +// || (bjnz > 32 * ainz && ainz < 256)) + { + + //---------------------------------------------------------------------- + // A(:,i) is very sparse compared to B(:,j), or visa versa + //---------------------------------------------------------------------- + + // Since B(:,j) is small, and much smaller than A(:,i), the efficient + // way to compute C(i,j) is a linear scan of B(:,j). For each B(k,j), + // a binary search for the index A(k,i) is done. The expected work to + // compute C(i,j) is thus O(bjnz * log2 (ainz)). If A(:,i) is very + // sparse compared to B(:,j), the opposite is done inside the kernel. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit + + GB_BUCKET ((ainz > 32 * bjnz && bjnz < 256) + || (bjnz > 32 * ainz && ainz < 256), GB_BUCKET_VSSP) ; + + } +// else if (ainz + bjnz <= 4) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are very tiny (total size 4 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + //GB_BUCKET (ainz + bjnz <= 4, GB_BUCKET_VSVS_4) ; + + } +// else if (ainz + bjnz <= 16) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are tiny (total size 16 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + //GB_BUCKET (ainz + bjnz <= 16, GB_BUCKET_VSVS_16) ; + + } +// else if (ainz + bjnz <= 64) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are small (total size 64 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + //GB_BUCKET (ainz + bjnz <= 64, GB_BUCKET_VSVS_64) ; + + } +// else if (ainz + bjnz <= 256) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are modest in size (total size 256 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + GB_BUCKET (ainz + bjnz <= 256, GB_BUCKET_VSVS_256) ; + + } +// else + { + + //---------------------------------------------------------------------- + // default: use the merge-path method + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_mp.cu.jit + GB_BUCKET (true, GB_BUCKET_MERGEPATH) ; + } + + // subtract one to undo the "bucket+1" assignment in the + // GB_BUCKET macro 
assignment expression. + return (GB_bucket_code) (b-1) ; +} + + +//-------------------------------------------------------------------------- +// GB_AxB_cuda_dot3_phase1: build nanobuckets, hunt for pre-zombies +//-------------------------------------------------------------------------- + +// GB_AxB_cuda_dot3_phase1 is a CUDA kernel that scans all entries in C and +// assigns them to each of the 12 buckets. The output is a 12-by-blockDim array of +// bucket counts, per threadblock (the nanobucket array). Each of the blockDim.x +// threads has its own set of 12 bucket counts. Each threadblock in this +// kernel then computes the first part of the cumulative sum of the +// nanobuckets, and writes it to global memory. + +// The kernel also computes Ci, of size nnz(C), which contains the +// zombie assignment or bucket assignment for non-zombies in C. + +template +__global__ void GB_AxB_cuda_dot3_phase1 +( + // outputs, preallocated in global memory: + int64_t *nanobuckets, // array of size 12-blockDim.x-by-gridDim.x + int64_t *blockbucket, // bucket counts, of size 12-by-gridDim.x + // input/output: + GrB_Matrix C, // final output matrix + // inputs, not modified: + const GrB_Matrix M, // mask matrix + const GrB_Matrix A, // input matrix + const GrB_Matrix B // input matrix +) +{ + + //-------------------------------------------------------------------------- + // get C, M, A, and B + //-------------------------------------------------------------------------- + + const int64_t *__restrict__ Mh = M->h ; + const int64_t *__restrict__ Mp = M->p ; + const int64_t *__restrict__ Mi = M->i ; + const Type_M *__restrict__ Mx = (Type_M*)M->x ; // not accessed if M is structural + const int64_t mnvec = M->nvec ; + const int64_t mnz = GB_NNZ(M) ; + const bool M_is_hyper = M->is_hyper ; + + const int64_t *__restrict__ Ah = A->h ; + const int64_t *__restrict__ Ap = A->p ; + const int64_t *__restrict__ Ai = A->i ; + const int64_t avlen = A->vlen ; + const int64_t anz = GB_NNZ(A) ; + const bool A_is_hyper = A->is_hyper ; + + const int64_t *__restrict__ Bh = B->h ; + const int64_t *__restrict__ Bp = B->p ; + const int64_t *__restrict__ Bi = B->i ; + const int64_t bvlen = B->vlen ; + const int64_t bnz = GB_NNZ(B); + const bool B_is_hyper = B->is_hyper ; + + // int64_t *restrict Cp = C->p ; // copy of Mp + // int64_t *restrict Ch = C->h ; // copy of Mh + int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment + + // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a + // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector + // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and + // where bucket is the bucket assignment for C(i,j). + // bucket can be recovered from Ci by bucket = Ci & 0xF + + //-------------------------------------------------------------------------- + // clear the bucket counters + //-------------------------------------------------------------------------- + + //ASSERT (mnz > 0) ; + //ASSERT (gridDim.x <= mnz) ; + + // each thread uses 12 bucket counters, held in register + int64_t my_bucket_0 = 0 ; + int64_t my_bucket_1 = 0 ; + int64_t my_bucket_2 = 0 ; + int64_t my_bucket_3 = 0 ; + int64_t my_bucket_4 = 0 ; + int64_t my_bucket_5 = 0 ; + int64_t my_bucket_6 = 0 ; + int64_t my_bucket_7 = 0 ; + int64_t my_bucket_8 = 0 ; + int64_t my_bucket_9 = 0 ; + int64_t my_bucket_10 = 0 ; + int64_t my_bucket_11 = 0 ; + + // Registers cannot be indexed (!) so this macro is used instead. 
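+    // (A local array "int64_t my_bucket [12]" indexed by a runtime value
+    // would typically be spilled to local memory by the compiler, so the
+    // counts are kept in 12 named scalars and selected with a switch.)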
+ // The bucket registers are indexed by the GB_bucket_code enum. + #define GB_BUCKET_COUNT(bucket) \ + { \ + switch (bucket) \ + { \ + case 0: my_bucket_0++ ; break ; \ + case 1: my_bucket_1++ ; break ; \ + case 2: my_bucket_2++ ; break ; \ + case 3: my_bucket_3++ ; break ; \ + case 4: my_bucket_4++ ; break ; \ + case 5: my_bucket_5++ ; break ; \ + case 6: my_bucket_6++ ; break ; \ + case 7: my_bucket_7++ ; break ; \ + case 8: my_bucket_8++ ; break ; \ + case 9: my_bucket_9++ ; break ; \ + case 10: my_bucket_10++ ; break ; \ + case 11: my_bucket_11++ ; break ; \ + } \ + } + /* + if(threadIdx.x==0 ) { + printf(" in phase1 kernel, mnz,anz,bnz= %ld,%ld,%ld\n",mnz,anz,bnz); + } + __syncthreads(); + */ + #define pointerchunk 256 + + __shared__ int64_t Mps[pointerchunk]; + __shared__ int64_t ks [chunksize]; + + //-------------------------------------------------------------------------- + // compute the task descriptor + //-------------------------------------------------------------------------- + + // all threads in this block will compute the same values for these: + int32_t pfirst, plast, kfirst, klast ; + /* + for ( int tid_global = threadIdx.x + blockIdx.x * blockDim.x ; + tid_global < (mnvec+ 7)/8 ; + tid_global += blockDim.x*gridDim.x) + */ + int chunk_max= (mnz + chunksize -1)/chunksize; + for ( int chunk = blockIdx.x; + chunk < chunk_max; + chunk += gridDim.x ) + { + + // The slice for each task contains entries pfirst:plast-1 of M and C. + //GB_PARTITION (pfirst, plast, mnz, chunk, (mnz+1023)/1024 ) ; + pfirst = chunksize * chunk ; + plast = GB_IMIN( chunksize * (chunk+1), mnz ) ; + + int chunk_end; + if ( mnz > chunksize) chunk_end = GB_IMIN( chunksize, + mnz - chunksize*(chunk) ) ; + else chunk_end = mnz; + + // find the first vector of the slice for task tid_global: the + // vector that owns the entry Ai [pfirst] and Ax [pfirst]. + kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec) -1 ; + //if( pfirst ==0) kfirst = 0; + + // find the last vector of the slice for task blockIdx.x: the + // vector that owns the entry Ai [plast-1] and Ax [plast-1]. + klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec) ; + + int k_end = GB_IMIN( pointerchunk , klast - kfirst +2 ) ; + /* + if( threadIdx.x ==0) + { + printf("chunk%d pfirst,plast,ch_end =%d,%d,%d kfirst,klast,kend = %d,%d,%d\n", + chunk, pfirst, plast, chunk_end, kfirst, klast, k_end ) ; + } + __syncthreads(); + */ + + + // load pointer values for this chunk + for ( int i = threadIdx.x; i< k_end; i+= blockDim.x) + { + Mps[i] = Mp[i + kfirst]; + } + __syncthreads(); + + // search for k values for each entry + float slope = (float)(mnvec)/(float)(mnz* chunksize) ; + for ( int i = threadIdx.x; i< chunk_end; i+= blockDim.x) + { + ks[i] = kfirst + slope*( float )(i); + while ( Mps[ ks[i] - kfirst + 1 ] <= (i+pfirst) ) + ks[i]++; + while ( Mps[ ks[i] - kfirst ] > (i+pfirst) ) + ks[i]--; + } + __syncthreads(); + + + //ASSERT (0 <= kfirst && kfirst <= klast && klast < mnvec) ; + /* + if (threadIdx.x ==0 ) { + printf ("threadblock %d after ksearch pfirst %ld plast %ld kfirst %ld klast %ld\n", + blockIdx.x, pfirst, plast, kfirst, klast) ; + } + __syncthreads(); + */ + + + + //-------------------------------------------------------------------------- + // assign entries in C(i,j) to the buckets + //-------------------------------------------------------------------------- + + // if B is hypersparse, bpleft ... 
TODO describe + // int64_t bpleft = 0 ; + + //---------------------------------------------------------------------- + // no binary search variant + //---------------------------------------------------------------------- + + //printf ("no binary search\n") ; + + //int32_t pM_start, pM_end ; + //for (int64_t pM = pfirst + threadIdx.x ; pM < plast ; pM += blockDim.x) + int32_t i,j; + int32_t k = kfirst ; + + //for (int64_t pM = pfirst; pM < plast; pM++ ) + for ( int pM = pfirst + threadIdx.x; + pM < pfirst + chunk_end; + pM += blockDim.x ) + { + GB_bucket_code bucket = GB_BUCKET_ZOMBIE ; + k = ks[ pM - pfirst ] ; + //k += ( pM == Mp[k+1] ) ; + //printf ("tid%d k %ld pM %ld\n", tid_global, k, pM; + i = Mi [ pM ] ; + + if ( MX ( pM ) ) + { + + // do a binary search for k (and j) that has this entry M(i,j) + //k = GB_search_for_vector_device (pM, Mp, k, klast) ; + +// HACK +j = k ; +// int64_t j = (Mh == NULL) ? k : Mh [k] ; + + //-------------------------------------------------------------- + // get B(:,j) + //-------------------------------------------------------------- + + int64_t pB, pB_end ; +// HACK: for sparse only, not hypersparse +pB = Bp [j] ; +pB_end = Bp [j+1] ; +// GB_lookup_device (B_is_hyper, Bh, Bp, &bpleft, bnvec-1, j, +// &pB, &pB_end) ; + int64_t bjnz = pB_end - pB ; + if (bjnz > 0) + { + // int64_t ib_first = Bi [pB] ; + // int64_t ib_last = Bi [pB_end-1] ; + + //---------------------------------------------------------- + // get A(:,i) + //---------------------------------------------------------- + + int64_t pA, pA_end ; + //int64_t apleft = 0 ; +// HACK: for sparse only, not hypersparse +pA = Ap [i] ; +pA_end = Ap [i+1] ; +// GB_lookup_device (A_is_hyper, Ah, Ap, &apleft, anvec-1, i, +// &pA, &pA_end) ; + int64_t ainz = pA_end - pA ; + if (ainz > 0) + { + // int64_t ia_first = Ai [pA] ; + // int64_t ia_last = Ai [pA_end-1] ; + + //------------------------------------------------------ + // determine the bucket for C(i,j) + //------------------------------------------------------ + + //bucket = GB_BUCKET_MERGEPATH ; + bucket= GB_bucket_assignment ( ainz, bjnz, bvlen) ; + } + } + } + + if (bucket == GB_BUCKET_ZOMBIE) + { + // mark C(i,j) is a zombie + //printf ("tid%d pM=%d %d,%d prezombie\n",threadIdx.x,pM,i,j) ; + Ci [pM] = GB_FLIP (i) << 4 ; + // GB_BUCKET_COUNT (GB_BUCKET_ZOMBIE) ; + my_bucket_0++ ; //0 is the zombie bucket + } + else + { + // place C(i,j) in its bucket + Ci [pM] = (k << 4) + bucket ; + GB_BUCKET_COUNT (bucket) ; + //printf ("tid%d pM=%d %d,%d b=%d\n",threadIdx.x, pM, i,j, (int)bucket) ; + } + } + + + + } + __syncthreads(); + + //-------------------------------------------------------------------------- + // cumulative sum of each bucket + //-------------------------------------------------------------------------- + + typedef cub::BlockScan BlockCumSum; + __shared__ typename BlockCumSum::TempStorage temp_storage; + + // The taskbucket for this thread block is an array of size + // 12-by-blockDim.x, held by row. Each thread owns one column of this + // taskbucket, the nanobucket. The nanobucket is a column of length 12, + // with stride equal to blockDim.x. 
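+    // Equivalently: the count for bucket b accumulated by thread t of
+    // threadblock blk is stored at
+    //      nanobuckets [blk * (12 * blockDim.x) + b * blockDim.x + t]
+    // and, after the exclusive scan below, that slot holds the total of the
+    // counts of threads 0..t-1 of this threadblock for bucket b.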
+ int64_t *nanobucket = + nanobuckets + blockIdx.x * (12 * blockDim.x) + threadIdx.x ; + + #define CUMSUM_AND_STORE_NANOBUCKET(bucket) \ + if( threadIdx.x == blockDim.x-1) \ + blockbucket [blockIdx.x + bucket * gridDim.x] = \ + my_bucket_ ## bucket ; \ + BlockCumSum(temp_storage).ExclusiveSum \ + ( my_bucket_ ## bucket, my_bucket_ ## bucket) ; \ + __syncthreads(); \ + nanobucket [bucket * blockDim.x] = my_bucket_ ## bucket ; + + CUMSUM_AND_STORE_NANOBUCKET (0) ; + CUMSUM_AND_STORE_NANOBUCKET (1) ; + CUMSUM_AND_STORE_NANOBUCKET (2) ; + CUMSUM_AND_STORE_NANOBUCKET (3) ; + CUMSUM_AND_STORE_NANOBUCKET (4) ; + CUMSUM_AND_STORE_NANOBUCKET (5) ; + CUMSUM_AND_STORE_NANOBUCKET (6) ; + CUMSUM_AND_STORE_NANOBUCKET (7) ; + CUMSUM_AND_STORE_NANOBUCKET (8) ; + CUMSUM_AND_STORE_NANOBUCKET (9) ; + CUMSUM_AND_STORE_NANOBUCKET (10) ; + CUMSUM_AND_STORE_NANOBUCKET (11) ; + + /* + if(threadIdx.x +blockIdx.x*blockDim.x <= mnvec) //blockDim.x -1){ + { + printf("thd %d blk%d nbucket0 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[0]); + printf("thd %d blk%d nbucket1 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[1*blockDim.x]); + printf("thd %d blk%d nbucket2 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[2*blockDim.x]); + printf("thd %d blk%d nbucket3 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[3*blockDim.x]); + printf("thd %d blk%d nbucket4 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[4*blockDim.x]); + printf("thd %d blk%d nbucket5 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[5*blockDim.x]); + printf("thd %d blk%d nbucket6 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[6*blockDim.x]); + printf("thd %d blk%d nbucket7 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[7*blockDim.x]); + printf("thd %d blk%d nbucket8 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[8*blockDim.x]); + printf("thd %d blk%d nbucket9 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[9*blockDim.x]); + printf("thd %d blk%d nbucket10 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[10*blockDim.x]); + printf("thd %d blk%d nbucket11 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[11*blockDim.x]); + + } + __syncthreads(); + */ + + + // The last thread now has the sum of all nanobuckets, which is then saved + // to the global bucket counts. blockbucket is an array of size + // 12-by-gridDim.x, held by row, with one column per thread block. + // The last thread saves its result in the column of this thread block. + // Note that this write to global memory is not coalesced. 
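+
+    // (The writes are strided because entry (b,blk) of blockbucket lives at
+    // blockbucket [b * gridDim.x + blk], so the 12 stores made below by the
+    // last thread of this block are gridDim.x entries apart.)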
+ + #define STORE_GLOBAL_BUCKET_COUNT(bucket) \ + blockbucket [blockIdx.x + bucket * gridDim.x] += \ + my_bucket_ ## bucket ; + + if (threadIdx.x == blockDim.x - 1 ) + { + STORE_GLOBAL_BUCKET_COUNT (0) ; + STORE_GLOBAL_BUCKET_COUNT (1) ; + STORE_GLOBAL_BUCKET_COUNT (2) ; + STORE_GLOBAL_BUCKET_COUNT (3) ; + STORE_GLOBAL_BUCKET_COUNT (4) ; + STORE_GLOBAL_BUCKET_COUNT (5) ; + STORE_GLOBAL_BUCKET_COUNT (6) ; + STORE_GLOBAL_BUCKET_COUNT (7) ; + STORE_GLOBAL_BUCKET_COUNT (8) ; + STORE_GLOBAL_BUCKET_COUNT (9) ; + STORE_GLOBAL_BUCKET_COUNT (10) ; + STORE_GLOBAL_BUCKET_COUNT (11) ; + } + + /* + if(threadIdx.x == blockDim.x -1){ + + printf("block%d bbucket0 has %ld entries\n",blockIdx.x, blockbucket[0*gridDim.x+blockIdx.x]); + printf("block%d bbucket1 has %ld entries\n",blockIdx.x, blockbucket[1*gridDim.x+blockIdx.x]); + printf("block%d bbucket2 has %ld entries\n",blockIdx.x, blockbucket[2*gridDim.x+blockIdx.x]); + printf("block%d bbucket3 has %ld entries\n",blockIdx.x, blockbucket[3*gridDim.x+blockIdx.x]); + printf("block%d bbucket4 has %ld entries\n",blockIdx.x, blockbucket[4*gridDim.x+blockIdx.x]); + printf("block%d bbucket5 has %ld entries\n",blockIdx.x, blockbucket[5*gridDim.x+blockIdx.x]); + printf("block%d bbucket6 has %ld entries\n",blockIdx.x, blockbucket[6*gridDim.x+blockIdx.x]); + printf("block%d bbucket7 has %ld entries\n",blockIdx.x, blockbucket[7*gridDim.x+blockIdx.x]); + printf("block%d bbucket8 has %ld entries\n",blockIdx.x, blockbucket[8*gridDim.x+blockIdx.x]); + printf("block%d bbucket9 has %ld entries\n",blockIdx.x, blockbucket[9*gridDim.x+blockIdx.x]); + printf("block%d bbucket10 has %ld entries\n",blockIdx.x, blockbucket[10*gridDim.x+blockIdx.x]); + printf("block%d bbucket11 has %ld entries\n",blockIdx.x, blockbucket[11*gridDim.x+blockIdx.x]); + + } + __syncthreads(); + */ + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase2.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase2.cu new file mode 100644 index 0000000000..b4447c60ff --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase2.cu @@ -0,0 +1,436 @@ +//------------------------------------------------------------------------------ +// templates/GB_AxB_cuda_dot3_phase2: fill the global buckets +//------------------------------------------------------------------------------ + +// TODO describe me + +#define GB_KERNEL +#include +#include "GB_cuda_buckets.h" +#include "matrix.h" +#include +#include "local_cub/block/block_scan.cuh" + +using namespace cooperative_groups; + +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +struct BlockPrefixCallbackOp +{ + // Running prefix + int64_t running_total; + // Constructor + __device__ BlockPrefixCallbackOp(int64_t running_total) : running_total(running_total) {} + + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
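+    // (cub::BlockScan invokes this functor once for each tile it scans: the
+    // functor receives the aggregate of the current tile, and the value it
+    // returns is used as the exclusive prefix for that tile, so
+    // running_total accumulates the totals of all tiles processed so far.)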
+ __device__ int64_t operator()(int64_t block_aggregate) + { + int64_t old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +__inline__ +__device__ void blockBucketExclusiveSum(int bucketId, int64_t *d_data, int nblocks) +{ + #define blocksize 32 + + // Specialize BlockScan for a 1D block of 32 threads + typedef cub::BlockScan BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + BlockPrefixCallbackOp prefix_op(0); + + // Have the block iterate over segments of items + int64_t data=0; + + int64_t *blockbucket= d_data; + + for (int block_id = 0; block_id < nblocks; block_id += blocksize) + { + // Load a segment of consecutive items that are blocked across threads + + //printf("block %d entering sum\n",blockIdx.x); + int loc = block_id + threadIdx.x; + if ( loc < nblocks) + { + //printf("block %di loading tid=%d\n",block_id,tid); + data = blockbucket[bucketId*nblocks +loc ] ; + } + __syncthreads(); + + //printf("bb%d_%d s0 before prefix= %ld \n", block_id,bucketId, + // blockbucket[bucketId*nblocks + block_id+threadIdx.x] ) ; + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum( data, data, prefix_op); + __syncthreads(); + + if ( loc < nblocks) + { + blockbucket[bucketId*nblocks +loc ] = data ; + } + __syncthreads(); + + //printf("bb%d_%d = %ld \n", block_id, bucketId, blockbucket[bucketId*nblocks+block_id+threadIdx.x] ) ; + + data = 0; + } +} + + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSumPlus( thread_block_tile tile, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = tile.size() / 2; i > 0; i /= 2) { + val += tile.shfl_down( val, i); + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSumPlus( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid]=val; // Write reduced value to shared memory + //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val); + } + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 ) return val ; + //Final reduce within first warp + if (wid==0) val = warp_ReduceSumPlus( tile, val) ; + + return val; +} + +// GB_AxB_cuda_dot3_phase2 is a CUDA kernel that takes as input the +// nanobuckets and blockbucket arrays computed by the first phase kernel, +// GB_AxB_cuda_dot3_phase1. The launch geometry of this kernel must match the +// GB_AxB_cuda_dot3_phase1 kernel, with the same # of threads and threadblocks. 
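+
+// Illustrative sketch of that requirement (not the actual launch code;
+// "nthreads" and "nblocks" are placeholder names, and the phase1 kernel
+// would also need its template arguments):
+//
+//      GB_AxB_cuda_dot3_phase1 <<<nblocks, nthreads>>>
+//          (nanobuckets, blockbucket, C, M, A, B) ;
+//      GB_AxB_dot3_phase2 <<<nblocks, nthreads>>>
+//          (nanobuckets, blockbucket, bucketp, bucket, offset, C, cnz, nblocks) ;
+//
+// Both launches must use the same nblocks and nthreads, since phase2 and
+// phase2end index the nanobuckets array with the same gridDim.x and
+// blockDim.x that phase1 used when writing it.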
+ +__global__ +void GB_AxB_dot3_phase2 +( + // input, not modified: + int64_t *__restrict__ nanobuckets, // array of size 12-blockDim.x-by-nblocks + int64_t *__restrict__ blockbucket, // global bucket count, of size 12*nblocks + // output: + int64_t *__restrict__ bucketp, // global bucket cumsum, of size 13 + int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) + int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + GrB_Matrix C, // output matrix + const int64_t cnz, // number of entries in C and M + const int nblocks // input number of blocks to reduce +) +{ + + //-------------------------------------------------------------------------- + // get C and M + //-------------------------------------------------------------------------- + + //int64_t *Ci = C->i ; // for zombies, or bucket assignment + + // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a + // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector + // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and + // where bucket is the bucket assignment for C(i,j). This phase does not + // need k, just the bucket for each entry C(i,j). + + //-------------------------------------------------------------------------- + // sum up the bucket counts of prior threadblocks + //-------------------------------------------------------------------------- + + // blockbucket is an array of size 12-by-nblocks, held by row. The + // entry blockbucket [bucket * nblocks + t] holds the # of entries + // in the bucket (in range 0 to 11) found by threadblock t. + + + //__shared__ uint64_t offset [12] ; + uint64_t s_0=0; + uint64_t s_1=0; + uint64_t s_2=0; + uint64_t s_3=0; + uint64_t s_4=0; + uint64_t s_5=0; + uint64_t s_6=0; + uint64_t s_7=0; + uint64_t s_8=0; + uint64_t s_9=0; + uint64_t s_10=0; + uint64_t s_11=0; + + thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() ); + + //printf("block %d entering sum\n",blockIdx.x); + int tid = threadIdx.x + blockIdx.x*blockDim.x; + #define reduceBucket( B ) \ + for( tid = threadIdx.x + blockIdx.x*blockDim.x; \ + tid < nblocks; \ + tid += blockDim.x*gridDim.x) \ + { \ + s_ ## B += blockbucket[ B *nblocks +tid] ; \ + } \ + __syncthreads(); \ + s_ ## B = warp_ReduceSumPlus( tile, s_ ## B); + + reduceBucket( 0 ) + reduceBucket( 1 ) + reduceBucket( 2 ) + reduceBucket( 3 ) + reduceBucket( 4 ) + reduceBucket( 5 ) + reduceBucket( 6 ) + reduceBucket( 7 ) + reduceBucket( 8 ) + reduceBucket( 9 ) + reduceBucket( 10 ) + reduceBucket( 11 ) + + + //printf("summing blk,tid=%d,%d\n",blockIdx.x,threadIdx.x); + if (threadIdx.x ==0 ) + { + atomicAdd( (unsigned long long int*)&(offset[0]), s_0); + atomicAdd( (unsigned long long int*)&(offset[1]), s_1); + atomicAdd( (unsigned long long int*)&(offset[2]), s_2); + atomicAdd( (unsigned long long int*)&(offset[3]), s_3); + atomicAdd( (unsigned long long int*)&(offset[4]), s_4); + atomicAdd( (unsigned long long int*)&(offset[5]), s_5); + atomicAdd( (unsigned long long int*)&(offset[6]), s_6); + atomicAdd( (unsigned long long int*)&(offset[7]), s_7); + atomicAdd( (unsigned long long int*)&(offset[8]), s_8); + atomicAdd( (unsigned long long int*)&(offset[9]), s_9); + atomicAdd( (unsigned long long int*)&(offset[10]),s_10); + atomicAdd( (unsigned long long int*)&(offset[11]),s_11); + } + __syncthreads(); + + + + if( gridDim.x >= 12) + { + // Cumulative sum across blocks for each bucket + if (blockIdx.x <12) + blockBucketExclusiveSum( blockIdx.x, blockbucket, nblocks ) ; 
+ } + else + { + if (blockIdx.x == 0) + { + blockBucketExclusiveSum( 0, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 1, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 2, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 3, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 4, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 5, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 6, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 7, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 8, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 9, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 10, blockbucket, nblocks) ; + blockBucketExclusiveSum( 11, blockbucket, nblocks) ; + } + } + + + + + //-------------------------------------------------------------------------- + // last threadblock saves the cumsum of the 12 global buckets + //-------------------------------------------------------------------------- + /* do on cpu + if (blockIdx.x == 0) // gridDim.x - 1) + { + + // the last threadblock: compute all 12 global bucket sizes, and its + // cumulative sum + if (threadIdx.x == 0) + { + // the work in this last threadblock is single-threaded + uint64_t s = 0; + for (int bucket = 0 ; bucket < 12 ; bucket++) + { + // write the global cumsum of all buckets to the final global + // bucketp. bucketp [bucket] is the starting position in + // the bucket. + bucketp [bucket] = s ; + + // bucket_size is the total # of entries in this bucket, for + // all threadblocks. It has nearly been computed already, + // since offset [bucket] = sum (blockbucket (bucket,0:blockDim.x-1)). + // All that is left is to add the counts for the last threadblock.` + //int64_t global_bucket_size = offset [bucket]; + // + blockbucket [bucket * gridDim.x + blockIdx.x] ; + + //printf("bucketp[%d]= %ld\n",bucket, s); + // s is a cumulative sum of the global bucket sizes + s += offset[bucket]; // global_bucket_size ; + } + // The kth global bucket (for k = 0 to 11) appears in: + // bucket [bucketp [k]... bucketp [k+1]-1], + // so the end of the last bucket needs bucketp [12]. + bucketp [12] = (int64_t)s; + //printf("bucketp[12]= %ld\n", s); + // all entries in C now appear in the buckets. + // ASSERT (s == cnz) ; + } + __syncthreads ( ) ; + } + */ + +} // phase2 + + +__global__ +void GB_AxB_dot3_phase2end +( + // input, not modified: + int64_t *__restrict__ nanobuckets, // array of size 12-blockDim.x-by-nblocks + const int64_t *__restrict__ blockbucket, // global bucket count, of size 12*nblocks + // output: + const int64_t *__restrict__ bucketp, // global bucket cumsum, of size 13 + int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) + const int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + const GrB_Matrix C, // output matrix + const int64_t cnz // number of entries in C and M +) +{ + + + int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment + int64_t *__restrict__ Mp = C->p ; // for offset calculations + int64_t mnvec = C->nvec; + + //-------------------------------------------------------------------------- + // load and shift the nanobuckets for this thread block + //-------------------------------------------------------------------------- + + // The taskbucket for this threadblock is an array of size + // 12-by-blockDim.x, held by row. It forms a 2D array within the 3D + // nanobuckets array. 
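+    // (After the LOAD_NANOBUCKET macro below, each my_bucket_<b> register
+    // holds the global position at which this thread starts writing its
+    // entries of bucket b, namely:
+    //      bucketp [b]                                (start of bucket b)
+    //    + blockbucket [b * gridDim.x + blockIdx.x]   (entries from earlier
+    //                                                  threadblocks)
+    //    + nanobucket [b * blockDim.x]                 (entries from earlier
+    //                                                  threads of this block).)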
+ int64_t *__restrict__ taskbucket = nanobuckets + blockIdx.x * (12 * blockDim.x) ; + + //printf("block%d thd%d blockbucket= %ld\n", blockIdx.x, threadIdx.x, + // blockbucket[blockIdx.x*gridDim.x+blockIdx.x]); + + // Each thread in this threadblock owns one column of this taskbucket, for + // its set of 12 nanobuckets. The nanobuckets are a column of length 12, + // with stride equal to blockDim.x. + int64_t *__restrict__ nanobucket = taskbucket + threadIdx.x; + + // Each thread loads its 12 nanobucket values into registers. + #define LOAD_NANOBUCKET(bucket) \ + int64_t my_bucket_ ## bucket = \ + nanobucket [bucket * blockDim.x] \ + + blockbucket [bucket * gridDim.x + blockIdx.x]\ + + bucketp [bucket] ; + + LOAD_NANOBUCKET (0) ; + LOAD_NANOBUCKET (1) ; + LOAD_NANOBUCKET (2) ; + LOAD_NANOBUCKET (3) ; + LOAD_NANOBUCKET (4) ; + LOAD_NANOBUCKET (5) ; + LOAD_NANOBUCKET (6) ; + LOAD_NANOBUCKET (7) ; + LOAD_NANOBUCKET (8) ; + LOAD_NANOBUCKET (9) ; + LOAD_NANOBUCKET (10) ; + LOAD_NANOBUCKET (11) ; + + // Now each thread has an index into the global set of 12 buckets, + // held in bucket, of where to place its own entries. + + //-------------------------------------------------------------------------- + // construct the global buckets + //-------------------------------------------------------------------------- + + // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and + // C, which is the part of C operated on by this threadblock. + int64_t pfirst, plast ; + + /* + for ( int tid_global = threadIdx.x + blockIdx.x * blockDim.x ; + tid_global < (mnvec+7)/8 ; + tid_global += blockDim.x * gridDim.x) + */ + int chunk_max= (cnz + chunksize -1)/chunksize; + for ( int chunk = blockIdx.x; + chunk < chunk_max; + chunk += gridDim.x ) + { + + //GB_PARTITION (pfirst, plast, cnz, tid_global, (mnvec+7)/8 ) ; + pfirst = chunksize * chunk ; + plast = GB_IMIN( chunksize * (chunk+1), cnz ) ; + + int chunk_end; + if ( cnz > chunksize) chunk_end = GB_IMIN( chunksize, + cnz - chunksize*(chunk) ); + else chunk_end = cnz; + + // find the first vector of the slice for task blockIdx.x: the + // vector that owns the entry Ai [pfirst] and Ax [pfirst]. + //kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec) ; + + // find the last vector of the slice for task blockIdx.x: the + // vector that owns the entry Ai [plast-1] and Ax [plast-1]. + //klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec) ; + + + for ( int p = pfirst + threadIdx.x; + p < pfirst + chunk_end; + p += blockDim.x ) + { + // get the entry C(i,j), and extract its bucket. Then + // place the entry C(i,j) in the global bucket it belongs to. + + // TODO: these writes to global are not coalesced. Instead: each + // threadblock could buffer its writes to 12 buffers and when the + // buffers are full they can be written to global. 
+ int ibucket = Ci[p] & 0xF; + //printf(" thd: %d p,Ci[p] = %ld,%ld,%d\n", threadIdx.x, p, Ci[p], irow ); + switch (ibucket) + { + case 0: bucket [my_bucket_0++ ] = p ; Ci[p] = Ci[p] >>4; break ; //unshift zombies + case 1: bucket [my_bucket_1++ ] = p ; break ; + case 2: bucket [my_bucket_2++ ] = p ; break ; + case 3: bucket [my_bucket_3++ ] = p ; break ; + case 4: bucket [my_bucket_4++ ] = p ; break ; + case 5: bucket [my_bucket_5++ ] = p ; break ; + case 6: bucket [my_bucket_6++ ] = p ; break ; + case 7: bucket [my_bucket_7++ ] = p ; break ; + case 8: bucket [my_bucket_8++ ] = p ; break ; + case 9: bucket [my_bucket_9++ ] = p ; break ; + case 10: bucket [my_bucket_10++] = p ; break ; + case 11: bucket [my_bucket_11++] = p ; break ; + default: break; + } + + } + //__syncthreads(); + } + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu new file mode 100644 index 0000000000..5211464e3b --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu @@ -0,0 +1,176 @@ + +//------------------------------------------------------------------------------ +// AxB_dot3_phase3_dndn.cu +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a simple warp-based dense dot product algorithm, when the +// vectors coming from both A and B are dense, for any size of N. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(nzA, nzB), 32) + +// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. +// The work is to load the data, do the multiply and add work and finally +// reduce this data to a scalar, and write it to Cx[pair]. 
+ +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// GrB_Matrix C <- result matrix +// GrB_Matrix M <- mask matrix +// GrB_Matrix A <- input matrix A +// GrB_Matrix B <- input matrix B +// int sz <- size parameter (not used) + +#include +#include +#include +#include "matrix.h" +#include "mySemiRing.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warp_sz> +__inline__ __device__ T warp_ReduceSum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T next = g.shfl_down( val, i) ; + val = GB_ADD( val, next ); + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val, T Ident) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition(g); + + // Each warp performs partial reduction + val = warp_ReduceSum< T, warpSize>(tile, val); + + if (lane==0) shared[wid] = val; // Write reduced value to shared memory + + //tile.sync(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Ident ; + + if (wid==0) val = warp_ReduceSum< T, warpSize>(tile,val); //Final reduce within first warp + + return val; +} + + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_dndn +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Mi = M->i; + int64_t *Ci = C->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + // zombie count + int zc = 0; + int64_t pair_id; + + // total items to be inspected + int64_t nnzA = 0; + int64_t nnzB = 0; + int s = blockDim.x; + + // Main loop over pairs + for (pair_id = start + blockIdx.x; //warp per pair + pair_id < end; + pair_id += gridDim.x ){ + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + int64_t pA = Ap[i]; + int64_t xend = Ap[i+1]; + nnzA = xend - pA; + + int64_t pB = Bp[j]; + int64_t yend = Bp[j+1]; + nnzB = yend - pB; + + /* + if (threadIdx.x == 0 ){ + printf(" i,j = %d,%d nnz= %d xstart,end = %d,%d ystart,end = %d,%d\n", + (int)i,(int)j, (int)nnzA, (int)xstart,(int)xend, (int)ystart, (int)yend); + } + __syncthreads(); + */ + + + // convert global data pointer to the local pointer of this block + T_A aki; // *xdata = &Ax[xstart]; + T_B bkj; // *ydata = &Bx[ystart]; + T_Z cij; + + GB_GETA ( aki=(T_Z)Ax[pA+threadIdx.x] ) ; // aki = A(0,i) + GB_GETB ( bkj=(T_Z)Bx[pB+threadIdx.x] ) ; // bkj = B(0,j) + GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj + + for ( int tid = threadIdx.x + s; tid < nnzA; tid+= s) { + // cij += A(k,i) * B(k,j) + // GB_DOT_TERMINAL ( cij ) ; // break if cij == terminal + GB_GETA ( aki=(T_Z)Ax[pA+tid] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[pB+tid] ) ; // bkj = B(k,j) + GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj + } + + + 
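+      // Each thread now holds one partial result cij; after the shfl_down
+      // reduction below, lane 0 of the warp holds the combined value, which
+      // is why only threadIdx.x == 0 stores Cx and Ci.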
//-------------------------------------------------------------------------- + // reduce per-thread sums to a single scalar + //-------------------------------------------------------------------------- + thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() ); + cij = warp_ReduceSum ( tile, cij); + + // write result for this block to global mem + if (threadIdx.x == 0) + { + //printf("tid: %d final sum after reduce = %d\n", threadIdx.x, sum); + GB_PUTC( Cx[pair_id]=(T_C)cij ) ; + GB_PUTC( Ci[pair_id]=i ) ; + } + //__syncthreads ( ) ; + } + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit new file mode 100644 index 0000000000..e5564d863e --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit @@ -0,0 +1,178 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_dndn_cu = "templates/GB_jit_AxB_dot3_phase3_dndn.cu\n" +"\n" +"//------------------------------------------------------------------------------\n" +"// AxB_dot3_phase3_dndn.cu \n" +"//------------------------------------------------------------------------------\n" +"\n" +"// This CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a simple warp-based dense dot product algorithm, when the\n" +"// vectors coming from both A and B are dense, for any size of N.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(nzA, nzB), 32) \n" +"\n" +"// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. 
\n" +"// The work is to load the data, do the multiply and add work and finally \n" +"// reduce this data to a scalar, and write it to Cx[pair].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// GrB_Matrix C <- result matrix \n" +"// GrB_Matrix M <- mask matrix\n" +"// GrB_Matrix A <- input matrix A\n" +"// GrB_Matrix B <- input matrix B\n" +"// int sz <- size parameter (not used) \n" +"\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"matrix.h\"\n" +"#include \"mySemiRing.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warp_sz>\n" +"__inline__ __device__ T warp_ReduceSum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" T next = g.shfl_down( val, i) ;\n" +" val = GB_ADD( val, next ); \n" +" }\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"template\n" +"__inline__ __device__\n" +"T block_ReduceSum(thread_block g, T val, T Ident)\n" +"{\n" +" static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums\n" +" int lane = threadIdx.x % warpSize;\n" +" int wid = threadIdx.x / warpSize;\n" +" thread_block_tile tile = tiled_partition(g);\n" +"\n" +" // Each warp performs partial reduction\n" +" val = warp_ReduceSum< T, warpSize>(tile, val); \n" +"\n" +" if (lane==0) shared[wid] = val; // Write reduced value to shared memory\n" +"\n" +" //tile.sync(); // Wait for all partial reductions\n" +"\n" +" if (wid > 0 || gridDim.x == 1 ) return val;\n" +"\n" +" //read from shared memory only if that warp existed\n" +" val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : Ident ;\n" +"\n" +" if (wid==0) val = warp_ReduceSum< T, warpSize>(tile,val); //Final reduce within first warp\n" +"\n" +" return val;\n" +"}\n" +"\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_dndn \n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +" int64_t pair_id;\n" +"\n" +" // total items to be inspected\n" +" int64_t nnzA = 0;\n" +" int64_t nnzB = 0;\n" +" int s = blockDim.x;\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start + blockIdx.x; //warp per pair \n" +" pair_id < end; \n" +" pair_id += gridDim.x ){\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t xend = Ap[i+1];\n" +" nnzA = xend - pA;\n" +"\n" +" int64_t pB = Bp[j]; \n" +" int64_t yend = Bp[j+1]; \n" +" nnzB = yend - pB;\n" +"\n" +" /*\n" +" if (threadIdx.x == 0 ){\n" +" printf(\" i,j = %d,%d nnz= %d xstart,end = %d,%d ystart,end = %d,%d\\n\",\n" +" (int)i,(int)j, (int)nnzA, (int)xstart,(int)xend, (int)ystart, (int)yend);\n" +" }\n" +" __syncthreads(); \n" +" */\n" +"\n" +" \n" +" // convert global data pointer to the local pointer of this block\n" +" T_A aki; // *xdata = &Ax[xstart]; \n" +" T_B bkj; // *ydata = &Bx[ystart];\n" +" T_Z cij;\n" +"\n" +" GB_GETA ( aki=(T_Z)Ax[pA+threadIdx.x] ) ; // aki = A(0,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[pB+threadIdx.x] ) ; // bkj = B(0,j)\n" +" GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj\n" +"\n" +" for ( int tid = threadIdx.x + s; tid < nnzA; tid+= s) { \n" +" // cij += A(k,i) * B(k,j)\n" +" // GB_DOT_TERMINAL ( cij ) ; // break if cij == terminal\n" +" GB_GETA ( aki=(T_Z)Ax[pA+tid] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[pB+tid] ) ; // bkj = B(k,j)\n" +" GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj\n" +" }\n" +"\n" +"\n" +" //--------------------------------------------------------------------------\n" +" // reduce per-thread sums to a single scalar\n" +" //--------------------------------------------------------------------------\n" +" thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() );\n" +" cij = warp_ReduceSum ( tile, cij);\n" +"\n" +" // write result for this block to global mem\n" +" if (threadIdx.x == 0)\n" +" {\n" +" //printf(\"tid: %d final sum after reduce = %d\\n\", threadIdx.x, sum);\n" +" GB_PUTC( Cx[pair_id]=(T_C)cij ) ;\n" +" GB_PUTC( Ci[pair_id]=i ) ;\n" +" }\n" +" //__syncthreads ( ) ;\n" +" }\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu new file mode 100644 index 0000000000..825a02e2ab --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu @@ -0,0 +1,300 @@ +//------------------------------------------------------------------------------ +// AxB_dot3_phase3_mp.cu +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. 
The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are +// relatively close in size, neither is very spare nor dense, for any size of N. +// Handles arbitrary sparsity patterns with guaranteed load balance. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(g_xnz, g_ynz), 32) + +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job +// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot +// product on those items in the intersection, and finally reduce this data to a scalar, +// on exit write it to g_odata [b]. + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// matrix *C <- result matrix +// matrix *M <- mask matrix +// matrix *A <- input matrix A +// matrix *B <- input matrix B +#include +#include +#include +#include "mySemiRing.h" +#include "matrix.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warp_sz> +__device__ __inline__ +T GB_reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T next = g.shfl_down( val, i); + val = GB_ADD( val, next ) ; + } + return val; +} + +template< typename T, int warp_sz> +__device__ __inline__ +T reduce_plus(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val += g.shfl_down( val, i) ; + } + return val; // note: only thread 0 will return full sum and flag value +} + +#define intersects_per_thread 8 + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_mp +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Ci = C->i; + int64_t *Mi = M->i; + int64_t *Ai = A->i; + int64_t *Bi = B->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + + // zombie count + int zc = 0; + + int64_t pair_id; + + // set thread ID + int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + int tid = threadIdx.x; + + int b = blockIdx.x ; + + // total items to be inspected + int64_t nnzA = 0; + int64_t nnzB = 0; + int64_t n_intersect = 0; + + thread_block_tile tile = tiled_partition( this_thread_block()); + + int parts = blockDim.x; //(n_intersect+ intersects_per_thread -1)/ intersects_per_thread; + + // int has_zombies = 0 ; + + // Main loop over pairs + for (pair_id = start+ blockIdx.x; //warp per pair + pair_id < end; + pair_id += gridDim.x ) + { + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + int64_t xstart = Ap[i]; + int64_t xend = Ap[i+1]; + nnzA = xend - xstart; + + int64_t ystart = Bp[j]; + int64_t yend = Bp[j+1]; + nnzB = yend - ystart; + + n_intersect 
= GB_IMIN( xend -xstart, yend -ystart); + /* + if (threadIdx.x ==0 ) { + printf("block %d doing dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); + } + */ + //we want more than one intersection per thread + int64_t nxy = nnzA + nnzB; + + int work_per_thread = (nxy +parts -1)/parts; + int diag = GB_IMIN( work_per_thread*tid, nxy); + int diag_end = GB_IMIN( diag + work_per_thread, nxy); + //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, parts, work_per_thread, diag, diag_end); + + int x_min = GB_IMAX( (int)(diag - nnzB), 0); + int x_max = GB_IMIN( diag, nnzA); + + //printf("start thd%u x_min = %u x_max = %u\n", tid_global, x_min,x_max); + while ( x_min < x_max) { //binary search for correct diag break + int pivot = (x_min +x_max)/2; + if ( Ai[pivot + xstart] < Bi[ diag -pivot -1 + ystart]) { + x_min = pivot +1; + } + else { + x_max = pivot; + } + } + int xcoord = x_min; + int ycoord = diag -x_min -1; + if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+xstart] == Bi[ycoord+ystart]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two start points are known now + int tx_start = xcoord +xstart; + int ty_start = diag -xcoord +ystart; + + //if (x_start != y_start) + // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); + + x_min = GB_IMAX( (int)(diag_end - nnzB), 0); + x_max = GB_IMIN( diag_end, nnzA); + + while ( x_min < x_max) { + int pivot = (x_min +x_max)/2; + //printf("thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\n", tid_global, pivot, diag_end,x_min, x_max); + if ( Ai[pivot+ xstart] < Bi[ diag_end -pivot -1 +ystart]) { + x_min = pivot +1; + } + else { + x_max = pivot; + } + //printf("thd%u piv=%u xmin,xmax = %u,%u\n", tid_global, pivot, x_min, x_max); + } + xcoord = x_min; + ycoord = diag_end -x_min -1; + if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +xstart] == Bi[ycoord + ystart]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two end points are known now + int tx_end = xcoord +xstart; + int ty_end = diag_end - xcoord + ystart; + + T_A aki; + T_B bkj; + T_Z cij = GB_IDENTITY ; + + // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. + // just check if cij > 0 + + int cij_exists = 0 ; + //printf(" thd%u has init value %f\n",tid, cij); + + //merge-path dot product + int k = tx_start; + int l = ty_start; + while ( k < tx_end && l < ty_end ) + { + if (Ai [k] == Bi [l]) + { + GB_GETA ( aki=(T_Z)Ax[k] ) ; + GB_GETB ( bkj=(T_Z)Bx[l] ) ; + if (cij_exists) + { + T_Z t = GB_MULT( (T_Z)aki, (T_Z)bkj ); + GB_ADD_F (cij, t ) ; + //printf(" thd%d ix at %lld cij += %d * %d \n", tid_global, Ai[k], aki, bkj); + } + else + { + cij_exists = 1 ; + cij = GB_MULT ( (T_Z)aki, (T_Z)bkj ) ; + //printf(" thd%d ix at %lld cij = %d * %d \n", tid_global, Ai[k], Ax[k], Bx[l]); + } + // TODO check terminal condition + k+= 1; + l+= 1; + //printf(" block%u work value = %d, exists = %d\n", b, cij, cij_exists); + } + else + { + k += ( Ai[k] < Bi[l] ) ; + l += ( Ai[k] > Bi[l] ) ; + } + } + + //tile.sync( ) ; + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar, get OR of flag + //-------------------------------------------------------------------------- + /* + if (tid == 0) + { + printf ("reduce %d : %d exists = %d\n", b, cij, cij_exists) ; + } + __syncthreads(); + */ + + // Do vote here for control. 
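+        // tile.any() is a warp-wide ballot: it returns true in every lane of
+        // the 32-thread tile if any lane found a matching index pair, so all
+        // lanes agree on whether C(i,j) exists before doing the reduction.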
+ cij_exists = tile.any( cij_exists); + //tile.sync(); + + if (cij_exists) + { + cij = GB_reduce_sum( tile, cij ); + + } + // else has_zombies = 1; + + + //__syncthreads(); + //tile.sync( ); + // write result for this block to global mem + if (tid == 0) + { + //printf ("final %d : %d exists = %d\n", b, cij, cij_exists) ; + if (cij_exists) + { + //printf(" cij = %d\n", cij); + GB_PUTC ( Cx[pair_id]=(T_C)cij ) ; + GB_PUTC ( Ci[pair_id]=i ) ; + } + else + { + //printf(" dot %d is a zombie\n", pair_id); + zc++; + GB_PUTC ( Ci[pair_id]=GB_FLIP (i) ) ; + } + } + //__syncthreads(); + } + +//-------------------------------------------------------------------------- + + if( tid ==0 && zc > 0) + { + //printf("warp %d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //printf(" Czombie = %lld\n",C->nzombies); + } + + //__syncthreads(); + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu.jit new file mode 100644 index 0000000000..ed9c569df5 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu.jit @@ -0,0 +1,302 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_mp_cu = "templates/GB_jit_AxB_dot3_phase3_mp.cu\n" +"//------------------------------------------------------------------------------\n" +"// AxB_dot3_phase3_mp.cu \n" +"//------------------------------------------------------------------------------\n" +"\n" +"// This CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are \n" +"// relatively close in size, neither is very spare nor dense, for any size of N.\n" +"// Handles arbitrary sparsity patterns with guaranteed load balance.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(g_xnz, g_ynz), 32) \n" +"\n" +"// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. 
Its job\n" +"// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot\n" +"// product on those items in the intersection, and finally reduce this data to a scalar, \n" +"// on exit write it to g_odata [b].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// matrix *C <- result matrix \n" +"// matrix *M <- mask matrix\n" +"// matrix *A <- input matrix A\n" +"// matrix *B <- input matrix B\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T GB_reduce_sum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" T next = g.shfl_down( val, i);\n" +" val = GB_ADD( val, next ) ;\n" +" }\n" +" return val;\n" +"}\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T reduce_plus(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" val += g.shfl_down( val, i) ;\n" +" }\n" +" return val; // note: only thread 0 will return full sum and flag value\n" +"}\n" +"\n" +"#define intersects_per_thread 8\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> \n" +"__global__ void AxB_dot3_phase3_mp\n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +"\n" +" int64_t pair_id;\n" +"\n" +" // set thread ID\n" +" int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n" +" int tid = threadIdx.x;\n" +"\n" +" int b = blockIdx.x ;\n" +"\n" +" // total items to be inspected\n" +" int64_t nnzA = 0;\n" +" int64_t nnzB = 0;\n" +" int64_t n_intersect = 0;\n" +"\n" +" thread_block_tile tile = tiled_partition( this_thread_block());\n" +"\n" +" int parts = blockDim.x; //(n_intersect+ intersects_per_thread -1)/ intersects_per_thread; \n" +"\n" +" // int has_zombies = 0 ;\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start+ blockIdx.x; //warp per pair \n" +" pair_id < end; \n" +" pair_id += gridDim.x )\n" +" {\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t xstart = Ap[i];\n" +" int64_t xend = Ap[i+1];\n" +" nnzA = xend - xstart;\n" +"\n" +" int64_t ystart = Bp[j]; \n" +" int64_t yend = Bp[j+1]; \n" +" nnzB = yend - ystart;\n" +"\n" +" n_intersect = GB_IMIN( xend -xstart, yend -ystart); \n" +" /* \n" +" if (threadIdx.x ==0 ) {\n" +" printf(\"block %d doing dot %lld i,j= %lld,%lld\\n\", blockIdx.x, pair_id, i, j);\n" +" }\n" +" */\n" +" //we want more than one intersection per thread\n" +" int64_t nxy = 
nnzA + nnzB;\n" +"\n" +" int work_per_thread = (nxy +parts -1)/parts;\n" +" int diag = GB_IMIN( work_per_thread*tid, nxy);\n" +" int diag_end = GB_IMIN( diag + work_per_thread, nxy);\n" +" //printf(\" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\\n\",tid, parts, work_per_thread, diag, diag_end); \n" +"\n" +" int x_min = GB_IMAX( (int)(diag - nnzB), 0);\n" +" int x_max = GB_IMIN( diag, nnzA);\n" +"\n" +" //printf(\"start thd%u x_min = %u x_max = %u\\n\", tid_global, x_min,x_max);\n" +" while ( x_min < x_max) { //binary search for correct diag break\n" +" int pivot = (x_min +x_max)/2;\n" +" if ( Ai[pivot + xstart] < Bi[ diag -pivot -1 + ystart]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" }\n" +" int xcoord = x_min;\n" +" int ycoord = diag -x_min -1;\n" +" if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+xstart] == Bi[ycoord+ystart]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two start points are known now\n" +" int tx_start = xcoord +xstart;\n" +" int ty_start = diag -xcoord +ystart; \n" +"\n" +" //if (x_start != y_start)\n" +" // printf(\"start thd%u xs,ys = %i,%i\\n\", tid_global, x_start, y_start);\n" +"\n" +" x_min = GB_IMAX( (int)(diag_end - nnzB), 0);\n" +" x_max = GB_IMIN( diag_end, nnzA);\n" +"\n" +" while ( x_min < x_max) {\n" +" int pivot = (x_min +x_max)/2;\n" +" //printf(\"thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\\n\", tid_global, pivot, diag_end,x_min, x_max);\n" +" if ( Ai[pivot+ xstart] < Bi[ diag_end -pivot -1 +ystart]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" //printf(\"thd%u piv=%u xmin,xmax = %u,%u\\n\", tid_global, pivot, x_min, x_max);\n" +" }\n" +" xcoord = x_min;\n" +" ycoord = diag_end -x_min -1;\n" +" if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +xstart] == Bi[ycoord + ystart]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two end points are known now\n" +" int tx_end = xcoord +xstart; \n" +" int ty_end = diag_end - xcoord + ystart; \n" +"\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij = GB_IDENTITY ;\n" +"\n" +" // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists.\n" +" // just check if cij > 0\n" +"\n" +" int cij_exists = 0 ;\n" +" //printf(\" thd%u has init value %f\\n\",tid, cij);\n" +"\n" +" //merge-path dot product\n" +" int k = tx_start;\n" +" int l = ty_start;\n" +" while ( k < tx_end && l < ty_end )\n" +" {\n" +" if (Ai [k] == Bi [l])\n" +" {\n" +" GB_GETA ( aki=(T_Z)Ax[k] ) ;\n" +" GB_GETB ( bkj=(T_Z)Bx[l] ) ;\n" +" if (cij_exists)\n" +" {\n" +" T_Z t = GB_MULT( (T_Z)aki, (T_Z)bkj );\n" +" GB_ADD_F (cij, t ) ;\n" +" //printf(\" thd%d ix at %lld cij += %d * %d \\n\", tid_global, Ai[k], aki, bkj);\n" +" }\n" +" else\n" +" {\n" +" cij_exists = 1 ;\n" +" cij = GB_MULT ( (T_Z)aki, (T_Z)bkj ) ;\n" +" //printf(\" thd%d ix at %lld cij = %d * %d \\n\", tid_global, Ai[k], Ax[k], Bx[l]);\n" +" }\n" +" // TODO check terminal condition\n" +" k+= 1;\n" +" l+= 1;\n" +" //printf(\" block%u work value = %d, exists = %d\\n\", b, cij, cij_exists);\n" +" }\n" +" else\n" +" {\n" +" k += ( Ai[k] < Bi[l] ) ;\n" +" l += ( Ai[k] > Bi[l] ) ;\n" +" }\n" +" }\n" +"\n" +" //tile.sync( ) ;\n" +" //--------------------------------------------------------------------------\n" +" // reduce sum per-thread values to a single scalar, get OR of flag\n" +" //--------------------------------------------------------------------------\n" +" /*\n" +" if (tid == 0)\n" +" {\n" +" printf (\"reduce %d : %d exists = 
%d\\n\", b, cij, cij_exists) ;\n" +" }\n" +" __syncthreads();\n" +" */\n" +"\n" +" // Do vote here for control.\n" +" cij_exists = tile.any( cij_exists);\n" +" //tile.sync();\n" +"\n" +" if (cij_exists)\n" +" {\n" +" cij = GB_reduce_sum( tile, cij );\n" +" \n" +" }\n" +" // else has_zombies = 1;\n" +"\n" +"\n" +" //__syncthreads();\n" +" //tile.sync( );\n" +" // write result for this block to global mem\n" +" if (tid == 0)\n" +" {\n" +" //printf (\"final %d : %d exists = %d\\n\", b, cij, cij_exists) ;\n" +" if (cij_exists)\n" +" {\n" +" //printf(\" cij = %d\\n\", cij);\n" +" GB_PUTC ( Cx[pair_id]=(T_C)cij ) ;\n" +" GB_PUTC ( Ci[pair_id]=i ) ;\n" +" }\n" +" else\n" +" {\n" +" //printf(\" dot %d is a zombie\\n\", pair_id);\n" +" zc++;\n" +" GB_PUTC ( Ci[pair_id]=GB_FLIP (i) ) ;\n" +" }\n" +" }\n" +" //__syncthreads(); \n" +" }\n" +"\n" +"//--------------------------------------------------------------------------\n" +"\n" +" if( tid ==0 && zc > 0)\n" +" {\n" +" //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //printf(\" Czombie = %lld\\n\",C->zombie_count);\n" +" }\n" +"\n" +" //__syncthreads();\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu new file mode 100644 index 0000000000..537c489fb8 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu @@ -0,0 +1,136 @@ +//****************************************************************************** +// Sparse dot products in batch form, sparse - dense case. +// Each thread in this kernel is responsible for m vector-pairs(x,y), +// m = 256/sz, where sz is in {4, 16, 64, 256} +// We know each non-zero on the sparse side will hit a dense value. +// Template on +// Parameters: + +// int64_t start <- beginning of bucket +// int64_t end <- end of bucket +// int64_t *Bucket <- index of each pair in this bucket +// matrix *C <- C result matrix +// matrix *M <- Mask matrix +// matrix *A <- A matrix to multiply, sparse +// matrix *B <- B matrix to multiply, dense in sparse format? +// int sz <- size hint for smaller vector +//****************************************************************************** +#include +#include +#include +#include "mySemiRing.h" +#include "matrix.h" + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_spdn +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Ci = C->i; + int64_t *Mi = M->i; + int64_t *Ai = A->i; + int64_t *Bi = B->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + // sz = expected non-zeros per dot + int m = 256/sz; + int nvecs = end - start; + int dpt = nvecs/32; + m = dpt < m ? 
dpt : m; + //if( threadIdx.x ==0) + // printf("thd:%d %d dots/thrd, nvecs = %d blockDim=%d\n",threadIdx.x, sz, nvecs, blockDim.x); + //__syncthreads(); + int dots = (nvecs +m -1)/m; + int zc = 0; + + for ( int tid= threadIdx.x +blockDim.x*blockIdx.x; + tid < dots; + tid += blockDim.x * gridDim.x) { + int pair_id, im; + //if (threadIdx.x ==0) + // printf("thd%u pi=%lld\n",tid, start+threadIdx.x); + // __syncthreads(); + + for (pair_id = start+tid, im = 0; + im < m && pair_id < end; + ++im, pair_id += dots ){ + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + //if (threadIdx.x ==0) + // printf("thd%u i,j=%lld,%lld\n",tid, i,j); + // __syncthreads(); + + // printf("thd%d pi=%d xn=%lld yn=%lld\n",tid, pair_id, + // A->p[i+1]- A->p[i], + // B->p[j+1]- B->p[j]); + + int64_t pA = Ap[i]; + int64_t pA_end = Ap[i+1]; + int64_t nnzA = pA_end - pA; + int64_t pB = Bp[i]; + int64_t pB_end = Bp[i+1]; + int64_t nnzB = pB_end - pB; + T_A aki; + T_B bkj; + T_Z cij; + + if( nnzA == A->vlen) // A is dense + { + int64_t k = Bi [pB] ; // first row index of B(:,j) + // cij = A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[pB] ) ; // bkj = B(k,j) + GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj + + for (int64_t p = pB+1 ; p < pB_end ; p++) + { + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + int64_t k = Bi [p] ; // next row index of B(:,j) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[p] ) ; // bkj = B(k,j) + GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj + } + + } + if( nnzB == B->vlen) // B is dense + { + int64_t k = Ai [pA] ; // first row index of A(:,i) + // cij = A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[ pA ] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[ pB+k ] ) ; // bkj = B(k,j) + GB_C_MULT ( cij, aki, bkj) ; // cij = aki * bkj + + for (int64_t p = pA+1 ; p < pA_end ; p++) + { + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + int64_t k = Ai [p] ; // next row index of A(:,i) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[ p ] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[ pB+k] ) ; // bkj = B(k,j) + GB_MULTADD ( cij, aki, bkj) ; // cij += aki * bkj + } + } + + GB_PUTC( Ci[pair_id]=i ) ; + GB_PUTC( Cx[pair_id]=cij ) ; + + } + + } + +} diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit new file mode 100644 index 0000000000..d057e78a6c --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit @@ -0,0 +1,138 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_spdn_cu = "templates/GB_jit_AxB_dot3_phase3_spdn.cu\n" +"//******************************************************************************\n" +"// Sparse dot products in batch form, sparse - dense case. \n" +"// Each thread in this kernel is responsible for m vector-pairs(x,y), \n" +"// m = 256/sz, where sz is in {4, 16, 64, 256}\n" +"// We know each non-zero on the sparse side will hit a dense value.\n" +"// Template on \n" +"// Parameters:\n" +"\n" +"// int64_t start <- beginning of bucket \n" +"// int64_t end <- end of bucket\n" +"// int64_t *Bucket <- index of each pair in this bucket\n" +"// matrix *C <- C result matrix \n" +"// matrix *M <- Mask matrix \n" +"// matrix *A <- A matrix to multiply, sparse \n" +"// matrix *B <- B matrix to multiply, dense in sparse format? 
\n" +"// int sz <- size hint for smaller vector\n" +"//******************************************************************************\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_spdn\n" +"( \n" +" int64_t start, \n" +" int64_t end,\n" +" int64_t *Bucket, \n" +" GrB_Matrix C, \n" +" GrB_Matrix M, \n" +" GrB_Matrix A, \n" +" GrB_Matrix B,\n" +" int sz \n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +" // sz = expected non-zeros per dot \n" +" int m = 256/sz;\n" +" int nvecs = end - start;\n" +" int dpt = nvecs/32;\n" +" m = dpt < m ? dpt : m;\n" +" //if( threadIdx.x ==0)\n" +" // printf(\"thd:%d %d dots/thrd, nvecs = %d blockDim=%d\\n\",threadIdx.x, sz, nvecs, blockDim.x);\n" +" //__syncthreads();\n" +" int dots = (nvecs +m -1)/m; \n" +" int zc = 0;\n" +" \n" +" for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;\n" +" tid < dots;\n" +" tid += blockDim.x * gridDim.x) {\n" +" int pair_id, im; \n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u pi=%lld\\n\",tid, start+threadIdx.x); \n" +" // __syncthreads();\n" +"\n" +" for (pair_id = start+tid, im = 0; \n" +" im < m && pair_id < end; \n" +" ++im, pair_id += dots ){\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u i,j=%lld,%lld\\n\",tid, i,j); \n" +" // __syncthreads();\n" +" \n" +" // printf(\"thd%d pi=%d xn=%lld yn=%lld\\n\",tid, pair_id, \n" +" // A->p[i+1]- A->p[i],\n" +" // B->p[j+1]- B->p[j]);\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" int64_t nnzA = pA_end - pA;\n" +" int64_t pB = Bp[i];\n" +" int64_t pB_end = Bp[i+1];\n" +" int64_t nnzB = pB_end - pB;\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij;\n" +"\n" +" if( nnzA == A->vlen) // A is dense\n" +" {\n" +" int64_t k = Bi [pB] ; // first row index of B(:,j)\n" +" // cij = A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[pB] ) ; // bkj = B(k,j)\n" +" GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj\n" +"\n" +" for (int64_t p = pB+1 ; p < pB_end ; p++)\n" +" { \n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" int64_t k = Bi [p] ; // next row index of B(:,j)\n" +" // cij += A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[p] ) ; // bkj = B(k,j)\n" +" GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj\n" +" }\n" +"\n" +" }\n" +" if( nnzB == B->vlen) // B is dense\n" +" {\n" +" int64_t k = Ai [pA] ; // first row index of A(:,i)\n" +" // cij = A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[ pA ] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[ pB+k ] ) ; // bkj = B(k,j)\n" +" GB_C_MULT ( cij, aki, bkj) ; // cij = aki * bkj\n" +"\n" +" for (int64_t p = pA+1 ; p < pA_end ; p++)\n" +" { \n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" int64_t k = Ai [p] ; // next row index of A(:,i)\n" +" // cij += A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[ p ] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[ pB+k] ) ; // bkj = B(k,j)\n" +" GB_MULTADD ( cij, aki, bkj) ; // cij += aki * bkj\n" +" }\n" +" }\n" +"\n" +" GB_PUTC( Ci[pair_id]=i ) ;\n" +" GB_PUTC( 
Cx[pair_id]=cij ) ;\n" +" \n" +" }\n" +" \n" +" }\n" +" \n" +"}\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu new file mode 100644 index 0000000000..3ed255b7e3 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu @@ -0,0 +1,234 @@ +//------------------------------------------------------------------------------ +// spGEMM_very_sparse_sparse.cu +//------------------------------------------------------------------------------ + +// The spGEM_vssp CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a binary-search algorithm, when the sizes nnzA and nnzB +// are far apart in size, neither is very spare nor dense, for any size of N. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(nzA, nzB), 32) + +// Thus, each t in threadblock b owns a part of the set of pairs in the +// sparse-sparse bucket of work. The job for each pair of vectors is to find +// the intersection of the index sets Ai and Bi, perform the semi-ring dot +// product on those items in the intersection, and finally +// on exit write it to Cx [pair]. + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// GrB_Matrix C <- result matrix +// GrB_Matrix M <- mask matrix +// GrB_Matrix A <- input matrix A +// GrB_Matrix B <- input matrix B + +#include +#include +#include +#include "mySemiRing.h" +#include "matrix.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warpSize > +__device__ T reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val += g.shfl_down(val,i) ; + } + return val; // note: only thread 0 will return full sum +} + +#define intersects_per_thread 8 + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_vssp +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + // Typed pointers to access data in A,B,C + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Ci = C->i; + int64_t *Mi = M->i; + int64_t *Ai = A->i; + int64_t *Bi = B->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + // sz = expected non-zeros per dot + int m = 256/sz; + int nvecs = end - start; + int dpt = nvecs/(gridDim.x*32); + + int dots = (nvecs +dpt -1)/dpt; + + // zombie count + int zc = 0; + int64_t pair_id, im; + + // set thread ID + unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + unsigned int tid = threadIdx.x; + + unsigned long int b = blockIdx.x ; + + // Main loop over pairs + for (pair_id = start+ tid_global, im = 0; + pair_id < end && im < m; + pair_id += gridDim.x*blockDim.x, 
++im){ + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + if( j < 0) //Pre-zombie + { + zc++; + continue; + } + + int64_t pA = Ap[i]; + int64_t pA_end = Ap[i+1]; + int64_t nnzA = pA_end - pA; + + int64_t pB = B->p[j]; + int64_t pB_end = B->p[j+1]; + int64_t nnzB = pB_end - pB; + + //Search for each nonzero in the smaller vector to find intersection + bool cij_exists = false; + + T_A aki; + T_B bkj; + T_Z cij; + + if (nnzA <= nnzB) { + //---------------------------------------------------------------------- + // A(:,i) is very sparse compared to B(:,j) + //---------------------------------------------------------------------- + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + if (ia < ib) + { + // A(ia,i) appears before B(ib,j) + pA++ ; + } + else if (ib < ia) + { + // B(ib,j) appears before A(ia,i) + // discard all entries B(ib:ia-1,j) + int64_t pleft = pB + 1 ; + int64_t pright = pB_end - 1 ; + GB_TRIM_BINARY_SEARCH (ia, Bi, pleft, pright) ; + //ASSERT (pleft > pB) ; + pB = pleft ; + } + else // ia == ib == k + { + // A(k,i) and B(k,j) are the next entries to merge + #if defined ( GB_PHASE_1_OF_2 ) + cij_exists = true ; + break ; + #else + GB_DOT_MERGE ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + pA++ ; + pB++ ; + #endif + } + } + } + else { + //---------------------------------------------------------------------- + // B(:,j) is very sparse compared to A(:,i) + //---------------------------------------------------------------------- + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + if (ia < ib) + { + // A(ia,i) appears before B(ib,j) + // discard all entries A(ia:ib-1,i) + int64_t pleft = pA + 1 ; + int64_t pright = pA_end - 1 ; + GB_TRIM_BINARY_SEARCH (ib, Ai, pleft, pright) ; + //ASSERT (pleft > pA) ; + pA = pleft ; + } + else if (ib < ia) + { + // B(ib,j) appears before A(ia,i) + pB++ ; + } + else // ia == ib == k + { + // A(k,i) and B(k,j) are the next entries to merge + #if defined ( GB_PHASE_1_OF_2 ) + cij_exists = true ; + break ; + #else + GB_DOT_MERGE ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + pA++ ; + pB++ ; + #endif + } + } + + } + if ( cij_exists){ + GB_PUTC ( Ci[pair_id]=i ) ; + GB_PUTC ( Cx[pair_id]=(T_C)cij ) ; + } + else { + zc++; + //printf(" %lld, %lld is zombie %d!\n",i,j,zc); + GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ; + } + + + } + + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar + //-------------------------------------------------------------------------- + thread_block_tile tile = tiled_partition( this_thread_block()); + zc = reduce_sum(tile, zc); + + if( threadIdx.x ==0) { + //printf("warp %d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //printf(" Czombie = %lld\n",C->nzombies); + } + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit new file mode 100644 index 0000000000..d8d5f480b8 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit @@ -0,0 +1,230 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_vssp_cu = "templates/GB_jit_AxB_dot3_phase3_vssp.cu\n" +"//------------------------------------------------------------------------------\n" +"// spGEMM_very_sparse_sparse.cu \n" 
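+// The vssp kernel embedded in this string intersects a very sparse A(:,i) with a much
+// larger B(:,j) (or vice versa): when the short list's next index is larger, it jumps
+// ahead in the long list with a trimmed binary search (GB_TRIM_BINARY_SEARCH) instead
+// of stepping one entry at a time.  A minimal host-side sketch of that idea,
+// illustration only, with a hand-rolled lower bound (lower_bound64 is a made-up
+// helper, not the GraphBLAS macro):
+//
+//     #include <cstdio>
+//     #include <cstdint>
+//
+//     // first p in [pleft,pright] with X [p] >= target, or pright+1 if none
+//     static int64_t lower_bound64 (const int64_t *X, int64_t pleft, int64_t pright,
+//                                   int64_t target)
+//     {
+//         while (pleft <= pright)
+//         {
+//             int64_t pmid = pleft + (pright - pleft) / 2 ;
+//             if (X [pmid] < target) pleft = pmid + 1 ; else pright = pmid - 1 ;
+//         }
+//         return pleft ;
+//     }
+//
+//     int main (void)
+//     {
+//         int64_t Ai [3] = { 3, 40, 99 } ;                      // very sparse A(:,i)
+//         int64_t Bi [9] = { 0, 1, 2, 3, 5, 40, 41, 97, 99 } ;  // denser B(:,j)
+//         int64_t pA = 0, pA_end = 3, pB = 0, pB_end = 9, hits = 0 ;
+//         while (pA < pA_end && pB < pB_end)
+//         {
+//             if      (Ai [pA] < Bi [pB]) pA++ ;
+//             else if (Bi [pB] < Ai [pA]) pB = lower_bound64 (Bi, pB+1, pB_end-1, Ai [pA]) ;
+//             else    { hits++ ; pA++ ; pB++ ; }  // common index k: multiply-add goes here
+//         }
+//         printf ("%lld common indices\n", (long long) hits) ;  // prints 3
+//         return 0 ;
+//     }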
+"//------------------------------------------------------------------------------\n" +"\n" +"// The spGEM_vssp CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a binary-search algorithm, when the sizes nnzA and nnzB\n" +"// are far apart in size, neither is very spare nor dense, for any size of N.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(nzA, nzB), 32) \n" +"\n" +"// Thus, each t in threadblock b owns a part of the set of pairs in the \n" +"// sparse-sparse bucket of work. The job for each pair of vectors is to find \n" +"// the intersection of the index sets Ai and Bi, perform the semi-ring dot \n" +"// product on those items in the intersection, and finally\n" +"// on exit write it to Cx [pair].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// GrB_Matrix C <- result matrix \n" +"// GrB_Matrix M <- mask matrix\n" +"// GrB_Matrix A <- input matrix A\n" +"// GrB_Matrix B <- input matrix B\n" +"\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warpSize >\n" +"__device__ T reduce_sum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" val += g.shfl_down(val,i) ;\n" +" }\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"#define intersects_per_thread 8\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_vssp\n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +" // Typed pointers to access data in A,B,C\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +" // sz = expected non-zeros per dot \n" +" int m = 256/sz;\n" +" int nvecs = end - start;\n" +" int dpt = nvecs/(gridDim.x*32);\n" +" \n" +" int dots = (nvecs +dpt -1)/dpt; \n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +" int64_t pair_id, im;\n" +"\n" +" // set thread ID\n" +" unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n" +" unsigned int tid = threadIdx.x;\n" +"\n" +" unsigned long int b = blockIdx.x ;\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start+ tid_global, im = 0; \n" +" pair_id < end && im < m; \n" +" pair_id += gridDim.x*blockDim.x, ++im){\n" +"\n" +" int64_t i = 
Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" int64_t nnzA = pA_end - pA;\n" +"\n" +" int64_t pB = B->p[j]; \n" +" int64_t pB_end = B->p[j+1]; \n" +" int64_t nnzB = pB_end - pB;\n" +"\n" +" //Search for each nonzero in the smaller vector to find intersection \n" +" bool cij_exists = false;\n" +"\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij;\n" +"\n" +" if (nnzA <= nnzB) {\n" +" //----------------------------------------------------------------------\n" +" // A(:,i) is very sparse compared to B(:,j)\n" +" //----------------------------------------------------------------------\n" +"\n" +" while (pA < pA_end && pB < pB_end)\n" +" {\n" +" int64_t ia = Ai [pA] ;\n" +" int64_t ib = Bi [pB] ;\n" +" if (ia < ib)\n" +" { \n" +" // A(ia,i) appears before B(ib,j)\n" +" pA++ ;\n" +" }\n" +" else if (ib < ia)\n" +" { \n" +" // B(ib,j) appears before A(ia,i)\n" +" // discard all entries B(ib:ia-1,j)\n" +" int64_t pleft = pB + 1 ;\n" +" int64_t pright = pB_end - 1 ;\n" +" GB_TRIM_BINARY_SEARCH (ia, Bi, pleft, pright) ;\n" +" //ASSERT (pleft > pB) ;\n" +" pB = pleft ;\n" +" }\n" +" else // ia == ib == k\n" +" { \n" +" // A(k,i) and B(k,j) are the next entries to merge\n" +" #if defined ( GB_PHASE_1_OF_2 )\n" +" cij_exists = true ;\n" +" break ;\n" +" #else\n" +" GB_DOT_MERGE ;\n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" pA++ ;\n" +" pB++ ;\n" +" #endif\n" +" }\n" +" }\n" +" }\n" +" else {\n" +" //----------------------------------------------------------------------\n" +" // B(:,j) is very sparse compared to A(:,i)\n" +" //----------------------------------------------------------------------\n" +"\n" +" while (pA < pA_end && pB < pB_end)\n" +" {\n" +" int64_t ia = Ai [pA] ;\n" +" int64_t ib = Bi [pB] ;\n" +" if (ia < ib)\n" +" { \n" +" // A(ia,i) appears before B(ib,j)\n" +" // discard all entries A(ia:ib-1,i)\n" +" int64_t pleft = pA + 1 ;\n" +" int64_t pright = pA_end - 1 ;\n" +" GB_TRIM_BINARY_SEARCH (ib, Ai, pleft, pright) ;\n" +" //ASSERT (pleft > pA) ;\n" +" pA = pleft ;\n" +" }\n" +" else if (ib < ia)\n" +" { \n" +" // B(ib,j) appears before A(ia,i)\n" +" pB++ ;\n" +" }\n" +" else // ia == ib == k\n" +" { \n" +" // A(k,i) and B(k,j) are the next entries to merge\n" +" #if defined ( GB_PHASE_1_OF_2 )\n" +" cij_exists = true ;\n" +" break ;\n" +" #else\n" +" GB_DOT_MERGE ;\n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" pA++ ;\n" +" pB++ ;\n" +" #endif\n" +" }\n" +" }\n" +"\n" +" }\n" +" if ( cij_exists){\n" +" GB_PUTC ( Ci[pair_id]=i ) ;\n" +" GB_PUTC ( Cx[pair_id]=(T_C)cij ) ;\n" +" }\n" +" else {\n" +" zc++; \n" +" //printf(\" %lld, %lld is zombie %d!\\n\",i,j,zc);\n" +" GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ;\n" +" }\n" +"\n" +"\n" +" }\n" +"\n" +" //--------------------------------------------------------------------------\n" +" // reduce sum per-thread values to a single scalar\n" +" //--------------------------------------------------------------------------\n" +" thread_block_tile tile = tiled_partition( this_thread_block());\n" +" zc = reduce_sum(tile, zc);\n" +"\n" +" if( threadIdx.x ==0) {\n" +" //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //printf(\" Czombie = %lld\\n\",C->zombie_count);\n" +" }\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu new file mode 100644 
index 0000000000..7482a86b3f --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu @@ -0,0 +1,208 @@ +//****************************************************************************** +// Sparse dot version of Matrix-Matrix multiply with mask +// Each thread in this kernel is responsible for m vector-pairs(x,y), +// finding intersections and producting the final dot product for each +// using a serial merge algorithm on the sparse vectors. +// m = 256/sz, where sz is in {4, 16, 64, 256} +// For a vector-pair, sz = xnz + ynz +// Template on +// Parameters: + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// matrix *C <- result matrix +// matrix *M <- mask matrix +// matrix *A <- input matrix A +// matrix *B <- input matrix B +// int sz <- nnz of very sparse vectors + +// Blocksize is 1024, uses warp and block reductions to count zombies produced. +//****************************************************************************** +#define GB_KERNEL +#include +#include +#include +#include +#include "matrix.h" +#include "mySemiRing.h" + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSumPlus( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val += g.shfl_down( val, i); + } + return val; // note: only thread 0 will return full sum +} + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_Reduce( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T next = g.shfl_down( val, i) ; + val = GB_ADD( sum, next ) ; + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x & 31 ; // % warpSize; + int wid = threadIdx.x >> 5 ; // / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSumPlus( tile, val); + + // Wait for all partial reductions + if (lane==0) shared[wid]=val; // Write reduced value to shared memory + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + + //read from shared memory only if that warp existed + val = (threadIdx.x < (blockDim.x / warpSize ) ) ? shared[lane] : 0; + //printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); + + if (wid==0) val = warp_ReduceSumPlus( tile, val); //Final reduce within first warp + + return val; +} + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_vsvs +( + const int64_t start, + const int64_t end, + const int64_t *__restrict__ Bucket, + const GrB_Matrix C, + const GrB_Matrix M, + const GrB_Matrix A, + const GrB_Matrix B, + const int sz +) +{ + int dots = end - start; + // sz = expected non-zeros per dot + /* + int m = (gridDim.x*blockDim.x)*256/sz; + int dpt = (nvecs+ gridDim.x*blockDim.x -1)/(gridDim.x*blockDim.x); + m = dpt < m ? 
dpt : m; + + int dots = (nvecs +m -1)/m; + */ + const T_A *__restrict__ Ax = (T_A *)A->x ; + const T_B *__restrict__ Bx = (T_B *)B->x ; + T_C *__restrict__ Cx = (T_C *)C->x ; + int64_t *__restrict__ Ci = C->i ; + const int64_t *__restrict__ Mi = M->i ; + const int64_t *__restrict__ Ai = A->i ; + const int64_t *__restrict__ Bi = B->i ; + const int64_t *__restrict__ Ap = A->p ; + const int64_t *__restrict__ Bp = B->p ; + + int pfirst, plast; + + GB_PARTITION (pfirst, plast, dots, blockIdx.x, gridDim.x ) ; + /* + if( threadIdx.x ==0 ) + { + printf("block%d %d dots/thrd, start,end = %ld,%ld pf,pl=%d,%d blockDim=%d\n", + blockIdx.x, (dots + blockDim.x*gridDim.x -1)/(blockDim.x*gridDim.x), + start, end, pfirst, plast, blockDim.x); + } + __syncthreads(); + */ + + + int zc = 0 ; + + int64_t pair_id; + + //for ( int tid= threadIdx.x +blockDim.x*blockIdx.x; + // tid < dots; + // tid += blockDim.x * gridDim.x) + for ( int tid = pfirst+ threadIdx.x ; + tid < plast; + tid += blockDim.x ) + { + + pair_id = Bucket[ start + tid ]; + + int64_t i = Mi [pair_id] ; + int64_t j = Ci [pair_id]>>4 ; + + int64_t pA = Ap[i] ; + int64_t pA_end = Ap[i+1] ; + int64_t pB = Bp[j] ; + int64_t pB_end = Bp[j+1] ; + + T_A aki; + T_B bkj; + T_Z cij ; + + bool cij_exists = false; + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + if( ia == ib) + { + // A(k,i) and B(k,j) are the next entries to merge + #if defined ( GB_PHASE_1_OF_2 ) + cij_exists = true ; + break ; + #else + GB_DOT_MERGE ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + pA++ ; + pB++ ; + #endif + } + else + { + // A(ia,i) appears before B(ib,j) + pA += ( ia < ib); + // B(ib,j) appears before A(ia,i) + pB += ( ib < ia); + } + } + if (cij_exists){ + GB_PUTC ( Ci[pair_id] = i ) ; + GB_PUTC ( Cx[pair_id] = (T_C)cij ) ; + } + else{ + //printf(" %lld, %lld is zombie %d!\n",i,j,zc); + zc++; + GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ; + } + } + + __syncthreads(); + + //printf("thd%d zombie count = %d\n",threadIdx.x,zc); + zc = block_ReduceSum( this_thread_block(), zc); + __syncthreads(); + + if( threadIdx.x == 0 && zc > 0) { + //printf("block%d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //C->nzombies += (unsigned long long int)zc; + //printf("blk:%d Czombie = %lld\n", blockIdx.x,C->nzombies); + } + +} diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit new file mode 100644 index 0000000000..6885b62420 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit @@ -0,0 +1,216 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_vsvs_cu = "templates/GB_jit_AxB_dot3_phase3_vsvs.cu\n" +"//******************************************************************************\n" +"// Sparse dot version of Matrix-Matrix multiply with mask \n" +"// Each thread in this kernel is responsible for m vector-pairs(x,y), \n" +"// finding intersections and producting the final dot product for each\n" +"// using a serial merge algorithm on the sparse vectors. 
\n" +"// m = 256/sz, where sz is in {4, 16, 64, 256}\n" +"// For a vector-pair, sz = xnz + ynz \n" +"// Template on \n" +"// Parameters:\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// matrix *C <- result matrix \n" +"// matrix *M <- mask matrix\n" +"// matrix *A <- input matrix A\n" +"// matrix *B <- input matrix B\n" +"// int sz <- nnz of very sparse vectors\n" +"\n" +"// Blocksize is 1024, uses warp and block reductions to count zombies produced.\n" +"//******************************************************************************\n" +"#include \n" +"#include \n" +"#include \n" +"#include \n" +"//#include \"GB_matrix.h\"\n" +"#include \"matrix.h\"\n" +"#include \"mySemiRing.h\"\n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int tile_sz>\n" +"__inline__ __device__ \n" +"T warp_ReduceSumPlus( thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2) {\n" +" //printf(\"thd%d %d OP %d is %d\\n\", threadIdx.x, val, fold, OP( val, fold));\n" +" val += g.shfl_down( val, i);\n" +" }\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"template< typename T, int tile_sz>\n" +"__inline__ __device__ \n" +"T warp_Reduce( thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2) {\n" +" //printf(\"thd%d %d OP %d is %d\\n\", threadIdx.x, val, fold, OP( val, fold));\n" +" T next = g.shfl_down( val, i) ;\n" +" val = GB_ADD( sum, next ) ; \n" +" }\n" +" //if (threadIdx.x ==0) printf(\"thd%d single warp sum is %d\\n\", threadIdx.x, val);\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"template\n" +"__inline__ __device__\n" +"T block_ReduceSum(thread_block g, T val)\n" +"{\n" +" static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums\n" +" int lane = threadIdx.x % warpSize;\n" +" int wid = threadIdx.x / warpSize;\n" +" thread_block_tile tile = tiled_partition( g );\n" +"\n" +" // Each warp performs partial reduction\n" +" val = warp_ReduceSumPlus( tile, val); \n" +"\n" +" // Wait for all partial reductions\n" +" if (lane==0) { \n" +" //printf(\"thd%d warp%d sum is %d\\n\", threadIdx.x, wid, val);\n" +" shared[wid]=val; // Write reduced value to shared memory\n" +" //printf(\"thd%d stored warp %d sum %d\\n\", threadIdx.x, wid, val);\n" +" }\n" +" tile.sync(); // Wait for all partial reductions\n" +"\n" +" if (wid > 0 || gridDim.x == 1 ) return val;\n" +" //read from shared memory only if that warp existed\n" +" val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0;\n" +" //printf(\"thd%d warp loaded val = %d\\n\", threadIdx.x, lane, val);\n" +"\n" +" \n" +" if (wid==0) val = warp_ReduceSumPlus( tile, val); //Final reduce within first warp\n" +"\n" +" return val;\n" +"}\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_vsvs\n" +"( \n" +" int64_t start, \n" +" int64_t end,\n" +" int64_t *Bucket, \n" +" GrB_Matrix C, \n" +" GrB_Matrix M, \n" +" GrB_Matrix A, \n" +" GrB_Matrix B,\n" +" int sz \n" +")\n" +"{\n" +" // sz = expected non-zeros per dot \n" +" int m = 256/sz;\n" +" int nvecs = end - start;\n" +" int dpt = nvecs/32;\n" +" m = dpt < m ? dpt : m;\n" +" //__shared__ int zombie_local[32];\n" +" /*\n" +" if( threadIdx.x ==0 && blockIdx.x == 0)\n" +" printf(\" %d dots/thrd, nvecs = %d blockDim=%d\\n\",sz, nvecs, blockDim.x);\n" +" __syncthreads();\n" +" */\n" +" int dots = (nvecs +m -1)/m; \n" +" int zc = 0;\n" +"\n" +" T_A *Ax = (T_A *)A->x ;\n" +" T_B *Bx = (T_B *)B->x ;\n" +" T_C *Cx = (T_C *)C->x ;\n" +" int64_t *Ci = C->i ;\n" +" int64_t *Mi = M->i ;\n" +" int64_t *Ai = A->i ;\n" +" int64_t *Bi = B->i ;\n" +" int64_t *Ap = A->p ;\n" +" int64_t *Bp = B->p ;\n" +" \n" +" for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;\n" +" tid < dots;\n" +" tid += blockDim.x * gridDim.x) {\n" +" int pair_id, im; \n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u pi=%lld\\n\",tid, start+threadIdx.x); \n" +" // __syncthreads();\n" +"\n" +" for (pair_id = start+tid, im = 0; \n" +" im < m && pair_id < end; \n" +" ++im, pair_id += dots ){\n" +"\n" +" int64_t i = Mi [pair_id] ;\n" +" int64_t j = Ci [pair_id]>>4 ; \n" +" //int64_t i = M->i[pair_id];\n" +" //int64_t j = C->i[pair_id] >> 4;\n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u i,j=%lld,%lld\\n\",tid, i,j); \n" +" // __syncthreads();\n" +" \n" +" // printf(\"thd%d pi=%d xn=%lld yn=%lld\\n\",tid, pair_id, \n" +" // A->p[i+1]- A->p[i],\n" +" // B->p[j+1]- B->p[j]);\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" int64_t pB = Bp[j]; \n" +" int64_t pB_end = Bp[j+1]; \n" +"\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij ;\n" +"\n" +" bool cij_exists = false;\n" +"\n" +" while (pA < pA_end && pB < pB_end)\n" +" {\n" +" int64_t ia = Ai [pA] ;\n" +" int64_t ib = Bi [pB] ;\n" +" if (ia < ib)\n" +" { \n" +" // A(ia,i) appears before B(ib,j)\n" +" pA++ ;\n" +" }\n" +" else if (ib < ia)\n" +" { \n" +" // B(ib,j) appears before A(ia,i)\n" +" pB++ ;\n" +" }\n" +" else // ia == ib == k\n" +" { \n" +" // A(k,i) and B(k,j) are the next entries to merge\n" +" #if defined ( GB_PHASE_1_OF_2 )\n" +" cij_exists = true ;\n" +" break ;\n" +" #else\n" +" GB_DOT_MERGE ;\n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" pA++ ;\n" +" pB++ ;\n" +" #endif\n" +" }\n" +" }\n" +" if (cij_exists){\n" +" GB_PUTC ( Ci[pair_id] = i ) ;\n" +" GB_PUTC ( Cx[pair_id] = (T_C)cij ) ;\n" +" }\n" +" else{\n" +" zc++; \n" +" //printf(\" %lld, %lld is zombie %d!\\n\",i,j,zc);\n" +" GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ;\n" +" }\n" +" }\n" +" \n" +" }\n" +" //printf(\"thd%d zombie count = %d\\n\",threadIdx.x,zc);\n" +" zc = block_ReduceSum( this_thread_block(), zc); \n" +" \n" +" __syncthreads();\n" +" if( threadIdx.x == 0 && zc > 0) {\n" +" //printf(\"block zombie count = %d\\n\",zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //C->zombie_count += (unsigned long long int)zc;\n" +" //printf(\"blk:%d Czombie = %lld\\n\", blockIdx.x,C->zombie_count);\n" 
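+// Zombie bookkeeping in the kernel above: each thread counts its zombies locally, the
+// block folds those counts together, and a single thread issues one atomicAdd into the
+// matrix-wide counter, rather than one global atomic per zombie.  A minimal standalone
+// sketch of that shape, illustration only, using a shared-memory tally in place of the
+// shuffle-based block_ReduceSum (count_zombies_demo and flags are made-up names;
+// launch as, e.g., count_zombies_demo<<<nblocks,1024>>>(d_flags, n, d_nzombies)):
+//
+//     __global__ void count_zombies_demo (const int *flags, int n,
+//                                         unsigned long long *nzombies)
+//     {
+//         __shared__ unsigned int block_count ;
+//         if (threadIdx.x == 0) block_count = 0 ;
+//         __syncthreads ( ) ;
+//         unsigned int zc = 0 ;
+//         for (int k = blockIdx.x * blockDim.x + threadIdx.x ; k < n ;
+//              k += blockDim.x * gridDim.x)
+//         {
+//             zc += (flags [k] == 0) ;                 // this dot produced no entry
+//         }
+//         if (zc > 0) atomicAdd (&block_count, zc) ;   // cheap shared-memory atomic
+//         __syncthreads ( ) ;
+//         if (threadIdx.x == 0 && block_count > 0)
+//         {
+//             atomicAdd (nzombies, (unsigned long long) block_count) ; // one per block
+//         }
+//     }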
+" }\n" +" \n" +"}\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu new file mode 100644 index 0000000000..fff127a074 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu @@ -0,0 +1,388 @@ +//------------------------------------------------------------------------------ +// AxB_dot3_phase3_warpix.cu +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are +// relatively close in size, neither is very spare nor dense, for any size of N. +// Handles arbitrary sparsity patterns with guaranteed load balance. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(g_xnz, g_ynz), 32) + +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job +// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot +// product on those items in the intersection, and finally reduce this data to a scalar, +// on exit write it to g_odata [b]. + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// matrix *C <- result matrix +// matrix *M <- mask matrix +// matrix *A <- input matrix A +// matrix *B <- input matrix B +#define GB_KERNEL +#include +#include +#include "matrix.h" +#include +#include "mySemiRing.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warp_sz> +__device__ __inline__ +T GB_reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T next = g.shfl_down( val, i); + val = GB_ADD( val, next ) ; + } + return val; +} + +template< typename T, int warp_sz> +__device__ __inline__ +T reduce_plus(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val += g.shfl_down( val, i) ; + } + return val; // note: only thread 0 will return full sum and flag value +} + +#define intersects_per_thread 8 + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_warpix +( + int64_t start, + int64_t end, + int64_t *__restrict__ Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *__restrict__ Ax = (T_A*)A->x; + T_B *__restrict__ Bx = (T_B*)B->x; + T_C *__restrict__ Cx = (T_C*)C->x; + int64_t *__restrict__ Ci = C->i; + int64_t *__restrict__ Mi = M->i; + int64_t *__restrict__ Mp = M->p; + int64_t *__restrict__ Ai = A->i; + int64_t *__restrict__ Bi = B->i; + int64_t *__restrict__ Ap = A->p; + int64_t 
*__restrict__ Bp = B->p; + + int64_t mnvec = M->nvec; + + // zombie count + int zc; + + int64_t pair_id; + + // set thread ID + int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + int tid = threadIdx.x; + int b = blockIdx.x ; + + // total items to be inspected + int64_t nnzA = 0; + int64_t nnzB = 0; + + thread_block_tile tile = tiled_partition( this_thread_block()); + + //int parts = gridDim.x; //Each warp is a part + + //Find our part of the work bucket + int64_t pfirst, plast, kfirst, klast ; + GB_PARTITION (pfirst, plast, end-start, b, gridDim.x ) ; + /* + if( tid ==0 ) { + printf("block%d is alive, pf,pl=%ld,%ld \n", b, pfirst, plast); + } + __syncthreads(); + */ + + + __shared__ int64_t As[256]; + __shared__ int64_t Bs[256]; + __shared__ T_A Axs[256]; + __shared__ T_B Bxs[256]; + + /* + int Bpl[9]; // local offsets into shared for multiple vectors of B + int shr_vec[8] ; //columns of B we see in this task + + pair_id = Bucket[pfirst]; + int64_t i = Mi[pair_id] ; + int vecs = 1 ; + int last_vec = i; + shr_vec[0] = i; + for (int id =1; id< plast-pfirst; id++) + { + pair_id = Bucket[pfirst+id]; + i = Mi[pair_id]; + if (i == last_vec) continue; + vecs++; + shr_vec[vecs] = i; + last_vec = i; + } + int all_loaded = 0; + + Bpl[0] = 0; + for ( int k = 0; k < vecs; k++) + { + int64_t pA = Ap[ shr_vec[k] ]; + int64_t pA_end = Ap[ shr_vec[k] +1]; + nnzA = pA_end - pA; + Bpl[k+1] = Bpl[k] + nnzA; + for (int i = tid ; i < nnzA; i+= blockDim.x) + { + As[ Bpl[k] +i ] = Ai[ pA + i ] ; + } + __syncthreads(); + } + + //pre-load columns of B, which will be reused, to shared memory + //Due to loading a contigious block with stride 1 this is fast + + all_loaded = (Bpl[vecs] < 256 ); + if( tid == 0 ) { + printf("block%d loaded %d vals from B, vecs=%d, all_loaded=%d\n", + b, Bpl[vecs], vecs, all_loaded ); + } + __syncthreads(); + + + // reset counter + */ + // Main loop over pairs + for (int id = start + pfirst; // loop on pairs + id < start+ plast; + id ++ ) + { + int64_t pair_id = Bucket[id]; + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + int64_t pA = Ap[i]; + int64_t pA_end = Ap[i+1]; + nnzA = pA_end - pA; + + int64_t pB = Bp[j]; + int64_t pB_end = Bp[j+1]; + nnzB = pB_end - pB; + + zc = 0 ; + int j_last = -1 ; + + + // No search, this warp does all the work + + int tx_start = pA; + int tx_end = pA_end; + int ty_start = pB; + int ty_end = pB_end; + + for ( int i = tid; i < nnzA ; i+= blockDim.x) + { + As [i] = Ai[ pA + i]; + Axs[i] = Ax[ pA + i]; + } + __syncthreads(); + + if ( j != j_last) { + for ( int i = tid; i < nnzB ; i+= blockDim.x) + { + Bs [i] = Bi[ pB + i]; + Bxs[i] = Bx[ pB + i]; + } + __syncthreads(); + j_last = j; + } + + + /* + if ( tid==0 ) { + //printf("block %d dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); + printf("block%d dot %ld(i,j)=(%ld,%ld) xs,xe= %d,%d ys,ye = %d,%d \n", + b, pair_id, i, j, tx_start,tx_end, ty_start, ty_end); + //for(int a = 0; a < nnzA; a++) printf(" As[%d]:%ld ",a, As[j]); + } + tile.sync(); + */ + + + + // Warp intersection: balanced by design, no idle threads. + // Each 32 thread warp will handle 32 comparisons per loop. + // Either A or B takes stride 4, other takes stride 8 + // For this version A strides 4, B strides 8 + T_A aki; + T_B bkj; + T_Z cij = GB_IDENTITY ; + int Astride = nnzA > nnzB ? 8 : 4; + int Ashift = nnzA > nnzB ? 3 : 2; + int Amask = nnzA > nnzB ? 7 : 3; + int Bstride = nnzB >= nnzA ? 
8 : 4; + //printf(" Astride = %d, Bstride = %d\n", Astride, Bstride); + + // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. + // just check if cij > 0 + + int cij_exists = 0 ; + + //Warp intersection dot product + int bitty_row = tid & Amask ; + int bitty_col = tid >> Ashift ; + + int k = tx_start + bitty_row ; + int l = ty_start + bitty_col ; + + //Ai[k] = As[ k -pA ]; for lookup + //Bi[l] = Bs[ l -pB ]; + + + int inc_k,inc_l; + + int active = ( ( k < tx_end) && (l < ty_end ) ); + + /* + printf("block%d tid%d Ai,As=%ld,%ld Bi,Bs=%ld,%ld k,l =%d,%d active:%d\n", + b,tid, Ai[k], As[k -pA], Bi[l], Bs[l -pB], + k, l, active ); + */ + + + while ( tile.any(active) ) + { + inc_k = 0; + inc_l = 0; + int kp = k-pA; + int lp = l-pB; + if ( active ) + { + coalesced_group g = coalesced_threads(); + if ( g.thread_rank() == g.size()-1) + { + inc_k = ( As[kp] <= Bs[lp] ) ; + inc_l = ( Bs[lp] <= As[kp] ) ; + // printf("block%d tid%d inc_k= %d inc_l = %d\n",b, tid, inc_k, inc_l ); + } + //tile.sync(); + + if ( As [kp] == Bs [lp] ) + { + //Axs[kp] = Ax[k]; + //Bxs[lp] = Bx[l]; + + GB_GETA ( aki=(T_Z)Axs[kp] ) ; + GB_GETB ( bkj=(T_Z)Bxs[lp] ) ; + if (cij_exists) + { + T_Z t = GB_MULT( (T_Z) aki, (T_Z) bkj); + GB_ADD_F( cij, t ) ; + //printf("block%d thd%d ix at %ld(%ld) cij += %d * %d\n",b, tid, Ai[k], As[kp], aki, bkj); + } + else + { + cij_exists = 1 ; + cij = GB_MULT ( (T_Z) aki, (T_Z) bkj) ; + //printf(" thd%d ix at %ld(%ld) cij = %d * %d \n", tid, Ai[k], Ais[kp], aki, bkj); + } + } + // TODO check terminal condition + //printf(" block%u work value = %d, exists = %d\n", b, cij, cij_exists); + //printf("block%d tid%d k,l = %d,%d Ai,Bi = %ld,%ld \n", b, tid, k, l, Ai[k], Bi[l] ); + } + //tile.sync(); + //inc_k = tile.shfl_down( inc_k, 31-tid); + if( tile.any(inc_k) ) { + k =1+ tile.shfl_down(k,31-tid) + bitty_row ; // tid%Astride; + //Ais [k-pA] = As[k-pA]; + //Axs [bitty_row] = Ax[k]; + } + if( tile.any(inc_l) ) { + l =1+ tile.shfl_down(l,31-tid) + bitty_col ; // tid/Astride; + //Bis [l-pB] = Bs[l-pB]; + //Bxs [bitty_col] = Bx[l]; + } + active = ( ( k < tx_end) && (l < ty_end ) ); + //printf("block%d tid = %d k = %d l= %d active=%d\n", b, tid, k, l,active); + } + tile.sync(); + + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar, get OR of flag + //-------------------------------------------------------------------------- + + // Do vote here for control. 
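+ // A minimal standalone sketch of the broadcast idiom used in the loop above:
+ // tile.shfl_down( x, 31-tid) makes lane tid read from lane tid + (31-tid) = 31,
+ // so every lane receives the stepping decision made by the last (highest) lane.
+ // Illustration only, assuming a full 32-lane tile; g.shfl( x, 31) would be the more
+ // direct equivalent, and broadcast_top_lane_demo is a made-up name.
+ //
+ //     #include <cstdio>
+ //     #include <cooperative_groups.h>
+ //     using namespace cooperative_groups;
+ //
+ //     __global__ void broadcast_top_lane_demo (void)
+ //     {
+ //         thread_block_tile<32> g = tiled_partition<32>( this_thread_block()) ;
+ //         int tid = g.thread_rank() ;
+ //         int x   = 10 * tid ;                        // lane 31 holds 310
+ //         int top = g.shfl_down( x, 31 - tid) ;       // every lane reads lane 31
+ //         if (tid == 0) printf ("lane 0 sees %d\n", top) ;  // prints 310
+ //     }
+ //
+ //     int main (void)
+ //     {
+ //         broadcast_top_lane_demo <<<1,32>>> ( ) ;
+ //         cudaDeviceSynchronize ( ) ;
+ //         return 0 ;
+ //     }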
+ cij_exists = tile.any( cij_exists); + tile.sync(); + + if (cij_exists) + { + cij = GB_reduce_sum( tile, cij ); + } + tile.sync(); + + + // Atomic write result for this block to global mem + if (tid == 0) + { + //printf ("final %d : %d exists = %d\n", b, cij, cij_exists) ; + if (cij_exists) + { + //printf("block%d i,j =%ld,%ld cij = %d\n",b, i, j, cij); + GB_PUTC( Cx[pair_id] = (T_C) cij ) ; + GB_PUTC ( Ci[pair_id] = i ) ; + + } + else + { + //printf(" dot %d is a zombie\n", pair_id); + zc++; + GB_PUTC ( Ci[pair_id] = GB_FLIP (i) ) ; + } + + //__syncthreads(); + + + if( zc > 0) + { + //printf("warp %d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //printf("blk:%d Czombie = %lld\n",blockIdx.x,C->zombies); + } + + } + tile.sync(); + /* + */ + } +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit new file mode 100644 index 0000000000..96a938a7c1 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit @@ -0,0 +1,356 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_warpix_cu = "templates/GB_jit_AxB_dot3_phase3_warpix.cu\n" +"//------------------------------------------------------------------------------\n" +"// AxB_dot3_phase3_warpix.cu \n" +"//------------------------------------------------------------------------------\n" +"\n" +"// This CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are \n" +"// relatively close in size, neither is very spare nor dense, for any size of N.\n" +"// Handles arbitrary sparsity patterns with guaranteed load balance.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(g_xnz, g_ynz), 32) \n" +"\n" +"// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. 
Its job\n" +"// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot\n" +"// product on those items in the intersection, and finally reduce this data to a scalar, \n" +"// on exit write it to g_odata [b].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// matrix *C <- result matrix \n" +"// matrix *M <- mask matrix\n" +"// matrix *A <- input matrix A\n" +"// matrix *B <- input matrix B\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T GB_reduce_sum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" T next = g.shfl_down( val, i);\n" +" val = GB_ADD( val, next ) ;\n" +" }\n" +" return val;\n" +"}\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T reduce_plus(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" val += g.shfl_down( val, i) ;\n" +" }\n" +" return val; // note: only thread 0 will return full sum and flag value\n" +"}\n" +"\n" +"#define intersects_per_thread 8\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> \n" +"__global__ void AxB_dot3_phase3_warp\n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +"\n" +" int64_t pair_id;\n" +"\n" +" // set thread ID\n" +" int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n" +" int tid = threadIdx.x;\n" +"\n" +" int b = blockIdx.x ;\n" +"\n" +" // total items to be inspected\n" +" int64_t nnzA = 0;\n" +" int64_t nnzB = 0;\n" +" int64_t n_intersect = 0;\n" +"\n" +" thread_block_tile tile = tiled_partition( this_thread_block());\n" +"\n" +" //int parts = gridDim.x; //Each warp is a part\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start +blockIdx.x; // 1 warp per pair \n" +" pair_id < end; \n" +" pair_id += gridDim.x )\n" +" {\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" nnzA = pA_end - pA;\n" +"\n" +" int64_t pB = Bp[j]; \n" +" int64_t pB_end = Bp[j+1]; \n" +" nnzB = pB_end - pB;\n" +"\n" +" n_intersect = INTMIN( nnzA, nnzB); \n" +"\n" +" /* only for multi-warp version\n" +" if( tid ==0 ) {\n" +" C->zombie_count = end - start;\n" +" Ci[pair_id] = GB_FLIP(i); // zombie until proven otherwise\n" +" printf(\"block %d flipped indices for %lld %lld\\n\", b, i, j);\n" +" }\n" +" tile.sync(); \n" +" */\n" +"\n" +" int64_t nxy = nnzA + nnzB;\n" +" 
/*\n" +" int work_per_warp = (nxy +parts -1)/parts;\n" +" int diag = INTMIN( work_per_warp*blockIdx.x, nxy);\n" +" int diag_end = INTMIN( diag + work_per_warp, nxy);\n" +" //printf(\" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\\n\",tid, parts, work_per_warp, diag, diag_end); \n" +"\n" +" int x_min = INTMAX( (int)(diag - nnzB), 0);\n" +" int x_max = INTMIN( diag, nnzA);\n" +"\n" +" //printf(\"start thd%u x_min = %u x_max = %u\\n\", tid_global, x_min,x_max);\n" +" while ( x_min < x_max) { //binary search for correct diag break\n" +" int pivot = (x_min +x_max)/2;\n" +" if ( Ai[pivot + pA] < Bi[ diag -pivot -1 + pB]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" }\n" +" int xcoord = x_min;\n" +" int ycoord = diag -x_min -1;\n" +" if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+pA] == Bi[ycoord+pB]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two start points are known now\n" +" int tx_start = xcoord +pA;\n" +" int ty_start = diag -xcoord +pB; \n" +"\n" +" //if (x_start != y_start)\n" +" // printf(\"start thd%u xs,ys = %i,%i\\n\", tid_global, x_start, y_start);\n" +"\n" +" x_min = INTMAX( (int)(diag_end - nnzB), 0);\n" +" x_max = INTMIN( diag_end, nnzA);\n" +"\n" +" while ( x_min < x_max) {\n" +" int pivot = (x_min +x_max)/2;\n" +" //printf(\"thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\\n\", tid_global, pivot, diag_end,x_min, x_max);\n" +" if ( Ai[pivot+ pA] < Bi[ diag_end -pivot -1 +pB]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" //printf(\"thd%u piv=%u xmin,xmax = %u,%u\\n\", tid_global, pivot, x_min, x_max);\n" +" }\n" +" xcoord = x_min;\n" +" ycoord = diag_end -x_min -1;\n" +" if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +pA] == Bi[ycoord +pB]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two end points are known now\n" +" int tx_end = xcoord +pA; \n" +" int ty_end = diag_end - xcoord + pB; \n" +"\n" +" */ \n" +"\n" +" // No search, this warp does all the work\n" +" int tx_start = pA;\n" +" int tx_end = pA_end;\n" +" int ty_start = pB;\n" +" int ty_end = pB_end;\n" +" /*\n" +" if (threadIdx.x ==0 ) {\n" +" printf(\"block %d dot %lld i,j= %lld,%lld\\n\", blockIdx.x, pair_id, i, j);\n" +" printf(\"block %d dot %lld xs,xe= %d,%d ys,ye = %d,%d\\n\", \n" +" blockIdx.x, pair_id, tx_start,tx_end, ty_start, ty_end);\n" +" }\n" +" tile.sync();\n" +" */\n" +"\n" +" // Warp intersection: balanced by design, no idle threads. \n" +" // Each 32 thread warp will handle 32 comparisons per loop.\n" +" // Either A or B takes stride 4, other takes stride 8\n" +" // For this version A strides 4, B strides 8\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij = MONOID_IDENTITY ;\n" +" int Astride = nnzA > nnzB ? 8 : 4;\n" +" int Ashift = nnzA > nnzB ? 3 : 2;\n" +" int Amask = nnzA > nnzB ? 7 : 3;\n" +" int Bstride = nnzB >= nnzA ? 
8 : 4;\n" +" //printf(\" Astride = %d, Bstride = %d\\n\", Astride, Bstride);\n" +"\n" +" // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists.\n" +" // just check if cij > 0\n" +"\n" +" int cij_exists = 0 ;\n" +"\n" +" //Warp intersection dot product\n" +" int bitty_row = tid & Amask ;\n" +" int bitty_col = tid >> Ashift ;\n" +"\n" +" int k = tx_start + bitty_row ;\n" +" int l = ty_start + bitty_col ;\n" +" int inc_k, inc_l;\n" +"\n" +" // int last_k = tx_start + 31%Astride ;\n" +" // int last_l = ty_start + 31/Astride ;\n" +"\n" +" //printf(\" thd%u has init value %d, k,l =%d, %d\\n\",tid, cij, k, l );\n" +" while ( k < tx_end && l < ty_end )\n" +" {\n" +" if (Ai [k] == Bi [l])\n" +" {\n" +" GB_GETA ( aki=(T_Z)Ax[k] ) ;\n" +" GB_GETB ( bkj=(T_Z)Bx[l] ) ;\n" +" if (cij_exists)\n" +" {\n" +" T_Z t = GB_MULT( (T_Z) aki, (T_Z) bkj);\n" +" cij = GB_ADD (cij, t ) ;\n" +" //printf(\" thd%d ix at %lld cij += %d * %d \\n\", tid_global, Ai[k], aki, bkj);\n" +" }\n" +" else\n" +" {\n" +" cij_exists = 1 ;\n" +" cij = GB_MULT ( (T_Z) aki, (T_Z) bkj) ;\n" +" //printf(\" thd%d ix at %lld cij = %d * %d \\n\", tid_global, Ai[k], Ax[k], Bx[l]);\n" +" }\n" +" // TODO check terminal condition\n" +" //printf(\" block%u work value = %d, exists = %d\\n\", b, cij, cij_exists);\n" +" }\n" +" if( tid == 31) // Last thread in the warp has the highest index.\n" +" {\n" +" inc_k = ( Ai[k] < Bi[l]);\n" +" inc_l = ( Ai[k] >= Bi[l]);\n" +" k += inc_k ; // * Astride ; \n" +" l += inc_l ; // * Bstride ; \n" +" // last_k += inc_k * Astride ; \n" +" // last_l += inc_l * Bstride ; \n" +" //printf(\"block%d k = %d l= %d, Ai,Bi = %lld,%lld\\n\", b, k, l, Ai[k], Bi[l]);\n" +" }\n" +" tile.sync();\n" +" inc_k = tile.shfl_down( inc_k, 31-tid);\n" +" if( inc_k) {\n" +" k = tile.shfl_down(k,31-tid) + bitty_row ; // tid%Astride;\n" +" }\n" +" else {\n" +" l = tile.shfl_down(l,31-tid) + bitty_col ; // tid/Astride;\n" +" }\n" +" //printf(\"block%d tid = %d k = %d l= %d\\n\", b, tid, k, l);\n" +" }\n" +"\n" +" //--------------------------------------------------------------------------\n" +" // reduce sum per-thread values to a single scalar, get OR of flag\n" +" //--------------------------------------------------------------------------\n" +"\n" +" // Do vote here for control.\n" +" cij_exists = tile.any( cij_exists);\n" +" tile.sync();\n" +"\n" +" if (cij_exists)\n" +" {\n" +" cij = GB_reduce_sum( tile, cij );\n" +" }\n" +" tile.sync();\n" +" \n" +"\n" +" // Atomic write result for this block to global mem\n" +" if (tid == 0)\n" +" {\n" +" //printf (\"final %d : %d exists = %d\\n\", b, cij, cij_exists) ;\n" +" if (cij_exists)\n" +" {\n" +" //printf(\" cij = %d\\n\", cij);\n" +" //T_C old = atomicCAS( (T_C *)&(Cx[pair_id]), MONOID_IDENTITY, (T_C) cij ) ;\n" +" //T_C assumed;\n" +" GB_PUTC( Cx[pair_id] = (T_C) cij ) ;\n" +" // Need ATOMIC_ADD here, use CAS on semi-ring operation \n" +" //if ( old != MONOID_IDENTITY) {\n" +" // do {\n" +" // assumed = old;\n" +" // old = atomicCAS( (T_C *)&(Cx[pair_id]), \n" +" // assumed, \n" +" // (T_C)( ADD( assumed, cij ) ) );\n" +" // }\n" +" // while ( assumed != old);\n" +" \n" +" //} \n" +" //unsigned long long int old_i;\n" +" //old_i = atomicCAS((unsigned long long int *)&(Ci[pair_id]),\n" +" // (unsigned long long int) GB_FLIP(i), i ) ; \n" +" //if ( old_i == GB_FLIP(i) ) {\n" +" // zc -= 1; //decrement zombies, this one is alive\n" +" // atomicAdd( (unsigned long long int*)&(C->zombie_count), \n" +" // (unsigned long long int)zc);\n" +" // }\n" +" GB_PUTC ( Ci[pair_id] = i ) ;\n" +" 
\n" +" }\n" +" else\n" +" {\n" +" //printf(\" dot %d is a zombie\\n\", pair_id);\n" +" zc++;\n" +" GB_PUTC ( Ci[pair_id] = GB_FLIP (i) ) ;\n" +" }\n" +" }\n" +" //__syncthreads(); \n" +" }\n" +"\n" +" if( tid ==0 && zc > 0)\n" +" {\n" +" //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //printf(\"blk:%d Czombie = %lld\\n\",blockIdx.x,C->zombie_count);\n" +" }\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/cooperative_groups.h b/GraphBLAS/CUDA/templates/cooperative_groups.h new file mode 100755 index 0000000000..1f296729e5 --- /dev/null +++ b/GraphBLAS/CUDA/templates/cooperative_groups.h @@ -0,0 +1,996 @@ +/* + * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _COOPERATIVE_GROUPS_H_ +# define _COOPERATIVE_GROUPS_H_ + +#if defined(__cplusplus) && defined(__CUDACC__) + +# include "cooperative_groups_helpers.h" + +_CG_BEGIN_NAMESPACE + +/** + * class thread_group; + * + * Generic thread group type, into which all groups are convertible. + * It acts as a container for all storage necessary for the derived groups, + * and will dispatch the API calls to the correct derived group. This means + * that all derived groups must implement the same interface as thread_group. + */ +class thread_group +{ + friend _CG_QUALIFIER thread_group this_thread(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend class thread_block; + + protected: + union __align__(8) { + unsigned int type : 8; + struct { + unsigned int type : 8; + unsigned int size : 24; + unsigned int mask; + } coalesced; + struct { + void* ptr[2]; + } buffer; + } _data; + + _CG_QUALIFIER thread_group operator=(const thread_group& src); + _CG_QUALIFIER thread_group(__internal::groupType type) { + _data.type = type; + } + +#if __cplusplus >= 201103L + static_assert(sizeof(_data) == 16, "Failed size check"); +#endif + +public: + _CG_QUALIFIER unsigned int size() const; + _CG_QUALIFIER unsigned int thread_rank() const; + _CG_QUALIFIER void sync() const; +}; + +/** + * thread_group this_thread() + * + * Constructs a generic thread_group containing only the calling thread + */ +_CG_QUALIFIER thread_group this_thread() +{ + thread_group g = thread_group(__internal::Coalesced); + g._data.coalesced.mask = __internal::lanemask32_eq(); + g._data.coalesced.size = 1; + return (g); +} + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + +/** + * class multi_grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same system, on multiple devices within the same launched kernels. + * To use this group, the kernel must have been launched with + * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). 
+ * + * Constructed via this_multi_grid(); + */ +class multi_grid_group +{ + friend _CG_QUALIFIER multi_grid_group this_multi_grid(); + + struct __align__(8) { + unsigned long long handle; + unsigned int size; + unsigned int rank; + } _data; + +#if __cplusplus >= 201103L + static_assert(sizeof(_data) == 16, "Failed size check"); +#endif + +public: + _CG_QUALIFIER multi_grid_group() { + _data.handle = __internal::multi_grid::get_intrinsic_handle(); + _data.size = __internal::multi_grid::size(_data.handle); + _data.rank = __internal::multi_grid::thread_rank(_data.handle); + } + + _CG_QUALIFIER bool is_valid() const { + return (_data.handle != 0); + } + + _CG_QUALIFIER void sync() const { + _CG_ASSERT(is_valid()); + __internal::multi_grid::sync(_data.handle); + } + + _CG_QUALIFIER unsigned int size() const { + _CG_ASSERT(is_valid()); + return (_data.size); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + _CG_ASSERT(is_valid()); + return (_data.rank); + } + + _CG_QUALIFIER unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (__internal::multi_grid::grid_rank(_data.handle)); + } + + _CG_QUALIFIER unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (__internal::multi_grid::num_grids(_data.handle)); + } +}; + +/** + * multi_grid_group this_multi_grid() + * + * Constructs a multi_grid_group + */ +_CG_QUALIFIER multi_grid_group this_multi_grid() +{ + return (multi_grid_group()); +} + +#endif + +#if defined(_CG_HAS_GRID_GROUP) + +/** + * class grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same device within the same launched kernel. To use this group, the kernel + * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_grid(); + */ +class grid_group +{ + friend _CG_QUALIFIER grid_group this_grid(); + + struct __align__(8) { + unsigned long long handle; + unsigned int size; + unsigned int rank; + } _data; + +#if __cplusplus >= 201103L + static_assert(sizeof(_data) == 16, "Failed size check"); +#endif + + public: + _CG_QUALIFIER grid_group() { + _data.handle = (__internal::grid::get_intrinsic_handle()); + _data.size = __internal::grid::size(_data.handle); + _data.rank = __internal::grid::thread_rank(_data.handle); + } + + _CG_QUALIFIER bool is_valid() const { + return (_data.handle != 0); + } + + _CG_QUALIFIER void sync() const { + _CG_ASSERT(is_valid()); + __internal::grid::sync(_data.handle); + } + + _CG_QUALIFIER unsigned int size() const { + _CG_ASSERT(is_valid()); + return (_data.size); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + _CG_ASSERT(is_valid()); + return (_data.rank); + } + + _CG_QUALIFIER dim3 group_dim() const { + _CG_ASSERT(is_valid()); + return (__internal::grid::grid_dim()); + } + +}; + +/** + * grid_group this_grid() + * + * Constructs a grid_group + */ +_CG_QUALIFIER grid_group this_grid() +{ + return (grid_group()); +} + +#endif + +/** + * class thread_block + * + * Every GPU kernel is executed by a grid of thread blocks, and threads within + * each block are guaranteed to reside on the same streaming multiprocessor. + * A thread_block represents a thread block whose dimensions are not known until runtime. 
+ * + * Constructed via this_thread_block(); + */ +class thread_block : public thread_group +{ + friend _CG_QUALIFIER thread_block this_thread_block(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz); + + _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) { + } + + // Internal Use + _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + __internal::abort(); + return (thread_block()); + } + + unsigned int mask; + unsigned int base_offset = thread_rank() & (~(tilesz - 1)); + unsigned int masklength = min(size() - base_offset, tilesz); + + mask = (unsigned int)(-1) >> (32 - masklength); + mask <<= (__internal::laneid() & ~(tilesz - 1)); + thread_group tile = thread_group(__internal::CoalescedTile); + tile._data.coalesced.mask = mask; + tile._data.coalesced.size = __popc(mask); + return (tile); + } + + public: + _CG_QUALIFIER void sync() const { + __internal::cta::sync(); + } + + _CG_QUALIFIER unsigned int size() const { + return (__internal::cta::size()); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + return (__internal::cta::thread_rank()); + } + + // Additional functionality exposed by the group + _CG_QUALIFIER dim3 group_index() const { + return (__internal::cta::group_index()); + } + + _CG_QUALIFIER dim3 thread_index() const { + return (__internal::cta::thread_index()); + } + + _CG_QUALIFIER dim3 group_dim() const { + return (__internal::cta::block_dim()); + } + +}; + +/** + * thread_block this_thread_block() + * + * Constructs a thread_block group + */ +_CG_QUALIFIER thread_block this_thread_block() +{ + return (thread_block()); +} + +/** + * class coalesced_group + * + * A group representing the current set of converged threads in a warp. + * The size of the group is not guaranteed and it may return a group of + * only one thread (itself). + * + * This group exposes warp-synchronous builtins. 
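+ *
+ * For example (an illustrative sketch added here, not part of the original
+ * NVIDIA header; 'val' and the branch condition are hypothetical):
+ *
+ *   if (val > 0) {
+ *       coalesced_group active = coalesced_threads();  // threads that took this branch
+ *       int leader_val = active.shfl(val, 0);          // broadcast from group rank 0
+ *   }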
+ * Constructed via coalesced_threads(); + */ +class coalesced_group : public thread_group +{ + friend _CG_QUALIFIER coalesced_group coalesced_threads(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz); + + _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { + unsigned int member_pack = 0; + unsigned int member_rank = 0; + for (int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (laneMask & lane_bit) + member_pack |= 1 << member_rank; + member_rank++; + } + } + return (member_pack); + } + + // Internal Use + _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + __internal::abort(); + return (coalesced_group(0)); + } + if (size() <= tilesz) { + return (*this); + } + + if ((_data.type == __internal::CoalescedTile) && pow2_tilesz) { + unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); + unsigned int masklength = min(size() - base_offset, tilesz); + unsigned int mask = (unsigned int)(-1) >> (32 - masklength); + + mask <<= (__internal::laneid() & ~(tilesz - 1)); + coalesced_group coalesced_tile = coalesced_group(mask); + coalesced_tile._data.type = __internal::CoalescedTile; + return (coalesced_tile); + } + else if ((_data.type == __internal::Coalesced) && pow2_tilesz) { + unsigned int mask = 0; + unsigned int member_rank = 0; + int seen_lanes = (thread_rank() / tilesz) * tilesz; + for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (seen_lanes <= 0 && member_rank < tilesz) { + mask |= lane_bit; + member_rank++; + } + seen_lanes--; + } + } + return (coalesced_group(mask)); + } + else { + // None in _CG_VERSION 1000 + __internal::abort(); + } + + return (coalesced_group(0)); + } + + protected: + // Construct a group from scratch (coalesced_threads) + _CG_QUALIFIER coalesced_group(unsigned int mask) : thread_group(__internal::Coalesced) { + _data.coalesced.mask = mask; + _data.coalesced.size = __popc(mask); + } + + public: + _CG_QUALIFIER unsigned int size() const { + return (_data.coalesced.size); + } + _CG_QUALIFIER unsigned int thread_rank() const { + return (__popc(_data.coalesced.mask & __internal::lanemask32_lt())); + } + _CG_QUALIFIER void sync() const { + __syncwarp(_data.coalesced.mask); + } + +#define COALESCED_SHFL_FUNCTION(type) \ + _CG_QUALIFIER type shfl(type var, unsigned int src_rank) const { \ + unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : \ + (size() == 32) ? 
src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); \ + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \ + } + +#define COALESCED_SHFL_UP_FUNCTION(type) \ + _CG_QUALIFIER type shfl_up(type var, int delta) const { \ + if (size() == 32) { \ + return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); \ + } \ + unsigned lane = __fns(_data.coalesced.mask, __internal::laneid(), -(delta + 1)); \ + if (lane >= 32) lane = __internal::laneid(); \ + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \ + } + +#define COALESCED_SHFL_DOWN_FUNCTION(type) \ + _CG_QUALIFIER type shfl_down(type var, int delta) const { \ + if (size() == 32) { \ + return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); \ + } \ + unsigned int lane = __fns(_data.coalesced.mask, __internal::laneid(), delta + 1); \ + if (lane >= 32) lane = __internal::laneid(); \ + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \ + } + + COALESCED_SHFL_FUNCTION(int); + COALESCED_SHFL_FUNCTION(unsigned int); + COALESCED_SHFL_FUNCTION(long); + COALESCED_SHFL_FUNCTION(unsigned long); + COALESCED_SHFL_FUNCTION(long long); + COALESCED_SHFL_FUNCTION(unsigned long long); + COALESCED_SHFL_FUNCTION(float); + COALESCED_SHFL_FUNCTION(double); + + COALESCED_SHFL_UP_FUNCTION(int); + COALESCED_SHFL_UP_FUNCTION(unsigned int); + COALESCED_SHFL_UP_FUNCTION(long); + COALESCED_SHFL_UP_FUNCTION(unsigned long); + COALESCED_SHFL_UP_FUNCTION(long long); + COALESCED_SHFL_UP_FUNCTION(unsigned long long); + COALESCED_SHFL_UP_FUNCTION(float); + COALESCED_SHFL_UP_FUNCTION(double); + + COALESCED_SHFL_DOWN_FUNCTION(int); + COALESCED_SHFL_DOWN_FUNCTION(unsigned int); + COALESCED_SHFL_DOWN_FUNCTION(long); + COALESCED_SHFL_DOWN_FUNCTION(unsigned long); + COALESCED_SHFL_DOWN_FUNCTION(long long); + COALESCED_SHFL_DOWN_FUNCTION(unsigned long long); + COALESCED_SHFL_DOWN_FUNCTION(float); + COALESCED_SHFL_DOWN_FUNCTION(double); + +# ifdef _CG_HAS_FP16_COLLECTIVE + COALESCED_SHFL_FUNCTION(__half); + COALESCED_SHFL_UP_FUNCTION(__half); + COALESCED_SHFL_DOWN_FUNCTION(__half); + + COALESCED_SHFL_FUNCTION(__half2); + COALESCED_SHFL_UP_FUNCTION(__half2); + COALESCED_SHFL_DOWN_FUNCTION(__half2); +# endif + +#undef COALESCED_SHFL_FUNCTION +#undef COALESCED_SHFL_UP_FUNCTION +#undef COALESCED_SHFL_DOWN_FUNCTION + + _CG_QUALIFIER int any(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) != 0); + } + _CG_QUALIFIER int all(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + if (size() == 32) { + return (__ballot_sync(0xFFFFFFFF, predicate)); + } + unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate); + return (_packLanes(lane_ballot)); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + +# define COALESCED_MATCH_ANY_FUNCTION(type) \ + _CG_QUALIFIER unsigned int match_any(type val) const { \ + if (size() == 32) { \ + return (__match_any_sync(0xFFFFFFFF, val)); \ + } \ + unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); \ + return (_packLanes(lane_match)); \ + } +# define COALESCED_MATCH_ALL_FUNCTION(type) \ + _CG_QUALIFIER unsigned int match_all(type val, int &pred) const { \ + if (size() == 32) { \ + return (__match_all_sync(0xFFFFFFFF, val, &pred)); \ + } \ + unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); \ + return (_packLanes(lane_match)); \ + } + + COALESCED_MATCH_ANY_FUNCTION(int); + COALESCED_MATCH_ANY_FUNCTION(unsigned int); + 
COALESCED_MATCH_ANY_FUNCTION(long); + COALESCED_MATCH_ANY_FUNCTION(unsigned long); + COALESCED_MATCH_ANY_FUNCTION(long long); + COALESCED_MATCH_ANY_FUNCTION(unsigned long long); + COALESCED_MATCH_ANY_FUNCTION(float); + COALESCED_MATCH_ANY_FUNCTION(double); + + COALESCED_MATCH_ALL_FUNCTION(int); + COALESCED_MATCH_ALL_FUNCTION(unsigned int); + COALESCED_MATCH_ALL_FUNCTION(long); + COALESCED_MATCH_ALL_FUNCTION(unsigned long); + COALESCED_MATCH_ALL_FUNCTION(long long); + COALESCED_MATCH_ALL_FUNCTION(unsigned long long); + COALESCED_MATCH_ALL_FUNCTION(float); + COALESCED_MATCH_ALL_FUNCTION(double); + +# undef COALESCED_MATCH_ANY_FUNCTION +# undef COALESCED_MATCH_ALL_FUNCTION + +#endif /* !_CG_HAS_MATCH_COLLECTIVE */ + +}; + +_CG_QUALIFIER coalesced_group coalesced_threads() +{ + return (coalesced_group(__activemask())); +} + +template +class __thread_block_tile_base : public thread_group +{ + static const unsigned int numThreads = Size; + + _CG_QUALIFIER unsigned int build_mask() const { + unsigned int mask; + + if (numThreads == 32) { + mask = 0xFFFFFFFF; + } + else { + mask = (unsigned int)(-1) >> (32 - numThreads); + mask <<= (__internal::laneid() & (~(numThreads - 1))); + } + return (mask); + } + + protected: + _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile) { + _data.coalesced.mask = build_mask(); + _data.coalesced.size = numThreads; + } + + public: + _CG_QUALIFIER void sync() const { + __syncwarp(build_mask()); + } + _CG_QUALIFIER unsigned int thread_rank() const { + return (__internal::laneid() & (numThreads - 1)); + } + _CG_QUALIFIER unsigned int size() const { + return (numThreads); + } + + // PTX supported collectives + _CG_QUALIFIER int shfl(int var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER int shfl_down(int var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER int shfl_up(int var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER int shfl_xor(int var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER unsigned int shfl(unsigned int var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER unsigned int shfl_down(unsigned int var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned int shfl_up(unsigned int var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned int shfl_xor(unsigned int var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER long shfl(long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER long shfl_down(long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long shfl_up(long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long shfl_xor(long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER unsigned long shfl(unsigned long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER unsigned long 
shfl_down(unsigned long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long shfl_up(unsigned long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long shfl_xor(unsigned long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER long long shfl(long long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER long long shfl_down(long long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long long shfl_up(long long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long long shfl_xor(long long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl(unsigned long long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl_down(unsigned long long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl_up(unsigned long long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl_xor(unsigned long long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER float shfl(float var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER float shfl_down(float var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER float shfl_up(float var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER float shfl_xor(float var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER double shfl(double var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER double shfl_down(double var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER double shfl_up(double var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER double shfl_xor(double var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER int any(int predicate) const { + unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate); + return (lane_ballot != 0); + } + _CG_QUALIFIER int all(int predicate) const { + unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate); + return (lane_ballot == build_mask()); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate); + return (lane_ballot >> (__internal::laneid() & (~(numThreads - 1)))); + } + +#ifdef _CG_HAS_FP16_COLLECTIVE + _CG_QUALIFIER __half shfl(__half var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER __half 
shfl_down(__half var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half shfl_up(__half var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half shfl_xor(__half var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER __half2 shfl(__half2 var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER __half2 shfl_down(__half2 var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half2 shfl_up(__half2 var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half2 shfl_xor(__half2 var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } +#endif + +#ifdef _CG_HAS_MATCH_COLLECTIVE + _CG_QUALIFIER unsigned int match_any(int val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(unsigned int val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(unsigned long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(long long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(unsigned long long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(float val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(double val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + + _CG_QUALIFIER unsigned int match_all(int val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(unsigned int val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(long val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(unsigned long val, int &pred) const { + unsigned int 
lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(long long val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(unsigned long long val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(float val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(double val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } +#endif + +}; + +/** + * class thread_block_tile + * + * Statically-sized group type, representing one tile of a thread block. + * The only specializations currently supported are those with native + * hardware support (1/2/4/8/16/32) + * + * This group exposes warp-synchronous builtins. + * Constructed via tiled_partition(class thread_block); + */ +template +class thread_block_tile; +template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { }; +template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { }; +template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { }; +template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { }; +template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { }; +template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { }; + +/** + * Outer level API calls + * void sync(GroupT) - see .sync() + * void thread_rank(GroupT) - see .thread_rank() + * void group_size(GroupT) - see .size() + */ +template _CG_QUALIFIER void sync(GroupT const &g) +{ + g.sync(); +} + +template _CG_QUALIFIER unsigned int thread_rank(GroupT const& g) +{ + return (g.thread_rank()); +} + +template _CG_QUALIFIER unsigned int group_size(GroupT const &g) +{ + return (g.size()); +} + +/** + * .sync() + * + * Executes a barrier across the group + * + * Implements both a compiler fence and an architectural fence to prevent, + * memory reordering around the barrier. + */ +_CG_QUALIFIER void thread_group::sync() const +{ + if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) { + static_cast(this)->sync(); + } + else { + static_cast(this)->sync(); + } +} + +/** + * .size() + * + * Returns the total number of threads in the group. + */ +_CG_QUALIFIER unsigned int thread_group::size() const +{ + if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) { + return (static_cast(this)->size()); + } + else { + return (static_cast(this)->size()); + } +} + +/** + * .thread_rank() + * + * Returns the linearized rank of the calling thread along the interval [0, size()). 
+ */ +_CG_QUALIFIER unsigned int thread_group::thread_rank() const +{ + if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) { + return (static_cast(this)->thread_rank()); + } + else { + return (static_cast(this)->thread_rank()); + } +} + +/** + * tiled_partition + * + * The tiled_partition(parent, tilesz) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will + * be created where threads having identical k = (thread_rank(parent)/tilesz) + * will be members of the same subgroup. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to power-of-two sized subgorup instances of at most + * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be + * tiled_partition() in _CG_VERSION 1000. + */ +_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz) +{ + if (parent._data.type == __internal::Coalesced || parent._data.type == __internal::CoalescedTile) { + return (static_cast(parent)._get_tiled_threads(tilesz)); + } + else { + return (static_cast(parent)._get_tiled_threads(tilesz)); + } +} +// Thread block type overload: returns a basic thread_group for now (may be specialized later) +_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} +// Coalesced group type overload: retains its ability to stay coalesced +_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +namespace __internal { + + // For specializing on different tiled_partition template arguments + template + struct tiled_partition_impl; + + template + struct tiled_partition_impl : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block const &) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<32> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<16> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<8> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<4> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<2> const&) : thread_block_tile() {} + }; + template <> + struct tiled_partition_impl<1, thread_block_tile<1> > : public thread_block_tile<1> { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<1> const&) : thread_block_tile<1>() {} + }; + +}; + +/** + * tiled_partition + * + * The tiled_partition(parent) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)/tilesz) subgroups will be created, + * therefore the parent group size must be evenly divisible by the tilesz. 
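+ *
+ * For example (an illustrative sketch added here, not part of the original
+ * NVIDIA header; variable names are hypothetical):
+ *
+ *   thread_block block = this_thread_block();
+ *   thread_block_tile<32> tile = tiled_partition<32>(block);
+ *   int r = tile.thread_rank();       // 0..31 within the tile
+ *   int s = tile.shfl_down(r, 16);    // warp-synchronous shuffle within the tile
+ *   tile.sync();                      // barrier across the 32-thread tile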
+ * The allow parent groups are thread_block or thread_block_tile. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to native hardware sizes, 1/2/4/8/16/32. + * The size(parent) must be greater than the template Size parameter + * otherwise the results are undefined. + */ +template +_CG_QUALIFIER thread_block_tile tiled_partition(const ParentT& g) +{ + return (__internal::tiled_partition_impl(g)); +} + +_CG_END_NAMESPACE + +# endif /* ! (__cplusplus, __CUDACC__) */ + +#endif /* !_COOPERATIVE_GROUPS_H_ */ diff --git a/GraphBLAS/CUDA/templates/cooperative_groups_helpers.h b/GraphBLAS/CUDA/templates/cooperative_groups_helpers.h new file mode 100755 index 0000000000..f1c499f62e --- /dev/null +++ b/GraphBLAS/CUDA/templates/cooperative_groups_helpers.h @@ -0,0 +1,286 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/* +** Define: _CG_VERSION +*/ +# define _CG_VERSION 1000 + +/* +** Define: _CG_ABI_VERSION +*/ +# ifndef _CG_ABI_VERSION +# define _CG_ABI_VERSION 1 +# endif + +/* +** Define: _CG_ABI_EXPERIMENTAL +** Desc: If enabled, sets all features enabled (ABI-breaking or experimental) +*/ +# if defined(_CG_ABI_EXPERIMENTAL) +# endif + +# define _CG_CONCAT_INNER(x, y) x ## y +# define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y) +# define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION) + +# define _CG_BEGIN_NAMESPACE \ + namespace cooperative_groups { namespace _CG_NAMESPACE { +# define _CG_END_NAMESPACE \ + }; using namespace _CG_NAMESPACE; }; + +# if !defined(_CG_STATIC_QUALIFIER) +# define _CG_STATIC_QUALIFIER static __forceinline__ __device__ +# endif +# if !defined(_CG_QUALIFIER) +# define _CG_QUALIFIER __forceinline__ __device__ +# endif + +# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__) +# define _CG_HAS_GRID_GROUP +# endif +# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__) +# define _CG_HAS_MULTI_GRID_GROUP +# endif +# if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__) +# define _CG_HAS_MATCH_COLLECTIVE +# endif +// Has __half and __half2 +// Only usable if you include the cuda_fp16.h extension, and +// _before_ including cooperative_groups.h +# ifdef __CUDA_FP16_TYPES_EXIST__ +# define _CG_HAS_FP16_COLLECTIVE +# endif + +/* +** Define: CG_DEBUG +** What: Enables various runtime safety checks +*/ +#if defined(__CUDACC_DEBUG__) && !defined(_CG_DEBUG) +# define _CG_DEBUG 1 +#endif + +#if defined(_CG_DEBUG) && (_CG_DEBUG == 1) && !defined(NDEBUG) +# include +# define _CG_ASSERT(x) assert((x)); +# define _CG_ABORT() assert(0); +#else +# define _CG_ASSERT(x) +# define _CG_ABORT() __trap(); +#endif + +_CG_BEGIN_NAMESPACE + +namespace __internal { + + enum groupType { + CoalescedTile, + Coalesced, + ThreadBlock, + Grid, + MultiGrid, + }; + +#if defined(_CG_HAS_GRID_GROUP) + + namespace grid { + + _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() + { + return (cudaCGGetIntrinsicHandle(cudaCGScopeGrid)); + } + + _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) + { + cudaCGSynchronizeGrid(handle, 0); + } + + _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) + { + return (blockDim.z * gridDim.z) * + (blockDim.y * gridDim.y) * + (blockDim.x * gridDim.x); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) + { + unsigned int blkIdx = ((blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + + blockIdx.x); + return (blkIdx * (blockDim.x * blockDim.y * blockDim.z) + + ((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + + threadIdx.x)); + } + + _CG_STATIC_QUALIFIER dim3 grid_dim() + { + return (dim3(gridDim.x, gridDim.y, gridDim.z)); + } + }; + +#endif + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + + namespace multi_grid { + + _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() + { + return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid)); + } + + _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) + { + cudaError_t err = cudaCGSynchronize(handle, 0); + } + + _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) + { + unsigned int numThreads = 0; + cudaCGGetSize(&numThreads, NULL, handle); + return numThreads; + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) + { + unsigned int threadRank = 0; + cudaCGGetRank(&threadRank, NULL, handle); + return threadRank; + } + + _CG_STATIC_QUALIFIER 
unsigned int grid_rank(const unsigned long long handle) + { + unsigned int gridRank = 0; + cudaCGGetRank(NULL, &gridRank, handle); + return gridRank; + } + + _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle) + { + unsigned int numGrids = 0; + cudaCGGetSize(NULL, &numGrids, handle); + return numGrids; + } + + }; + +#endif + + namespace cta { + + _CG_STATIC_QUALIFIER void sync() + { + __barrier_sync(0); + } + + _CG_STATIC_QUALIFIER unsigned int size() + { + return (blockDim.x * blockDim.y * blockDim.z); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return ((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + + threadIdx.x); + } + + _CG_STATIC_QUALIFIER dim3 group_index() + { + return (dim3(blockIdx.x, blockIdx.y, blockIdx.z)); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return (dim3(threadIdx.x, threadIdx.y, threadIdx.z)); + } + + _CG_STATIC_QUALIFIER dim3 block_dim() + { + return (dim3(blockDim.x, blockDim.y, blockDim.z)); + } + + }; + + _CG_STATIC_QUALIFIER unsigned int laneid() + { + unsigned int laneid; + asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid)); + return laneid; + } + + _CG_STATIC_QUALIFIER unsigned int warpsz() + { + unsigned int warpSize; + asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warpSize)); + return warpSize; + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_eq() + { + unsigned int lanemask32_eq; + asm volatile("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq)); + return (lanemask32_eq); + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_lt() + { + unsigned int lanemask32_lt; + asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt)); + return (lanemask32_lt); + } + + _CG_STATIC_QUALIFIER void abort() + { + _CG_ABORT(); + } + +}; // !Namespace internal + +_CG_END_NAMESPACE diff --git a/GraphBLAS/CUDA/templates/denseDotProduct.cu b/GraphBLAS/CUDA/templates/denseDotProduct.cu new file mode 100644 index 0000000000..62841ed4b2 --- /dev/null +++ b/GraphBLAS/CUDA/templates/denseDotProduct.cu @@ -0,0 +1,202 @@ +//------------------------------------------------------------------------------ +// denseDotProduct.cu +//------------------------------------------------------------------------------ + +// The denseDotProduct CUDA kernel produces the semi-ring dot product of two +// vectors of types T1 and T2 and common size n, to a vector odata of type T3. +// ie. we want to produce dot(x,y) in the sense of the given semi-ring. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. 
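+
+// Illustrative launch sketch (an assumption added for clarity, not part of
+// this kernel): each threadblock reduces 8*blockDim.x entries, so a host
+// caller would size the grid roughly as follows.  'x', 'y' and 'partial' are
+// hypothetical device pointers of types T1, T2 and T3:
+//
+//   int s = 128 ;                             // threads per block
+//   int nblocks = (n + 8*s - 1) / (8*s) ;     // one partial result per block
+//   denseDotProduct <<<nblocks, s>>> (x, y, partial, n) ;
+//   // the nblocks entries of 'partial' still need a final reduction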
+ +#include +#include "mySemiRing.h" +#include + +using namespace cooperative_groups; + +template< typename T3, int tile_sz> +__inline__ __device__ +T3 warp_ReduceSum(thread_block_tile g, T3 val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T3 fold = g.shfl_down( val, i); + val = ADD( val, fold ); + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T3 block_ReduceSum(thread_block g, T3 val) +{ + static __shared__ T3 shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition(g); + + // Each warp performs partial reduction + val = warp_ReduceSum(tile, val); + + if (lane==0) shared[wid]=val; // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : (T3)MONOID_IDENTITY3; + + + if (wid==0) val = warp_ReduceSum(tile,val); //Final reduce within first warp + + return val; +} + +template< typename T1, typename T2, typename T3> +__global__ void denseDotProduct +( + T1 *g_xdata, // array of size n, type T1 + T2 *g_ydata, // array of size n, type T2 + T3 *g_odata, // array of size grid.x, type T3 + unsigned int n +) +{ + // set thread ID + unsigned int tid = threadIdx.x ; + + // this threadblock b owns g_idata [block_start ... block_end-1] + unsigned long int s = blockDim.x ; + unsigned long int b = blockIdx.x ; + unsigned long int block_start = b * s * 8 ; + unsigned long int block_end = (b + 1) * s * 8 ; + + /* + if (tid == 0) + { + printf ("block %d: [%lu ... %ld]\n", b, block_start, block_end-1) ; + } + */ + + /* + if (tid == 0 && b == 0) + { + printf ("type is size %d\n", sizeof (T)) ; + for (int k = 0 ; k < n ; k++) printf ("%4d: %g\n", k, (double) g_idata [k]) ; + printf ("\n") ; + } + */ + + // each thread tid reduces its result into sum + T3 sum; + + // nothing to do + if (block_start > block_end) { return ; } + + // convert global data pointer to the local pointer of this block + T1 *xdata = g_xdata + block_start ; + T2 *ydata = g_ydata + block_start ; + + T1 x0, x1, x2, x3, x4, x5, x6, x7 ; + T2 y0, y1, y2, y3, y4, y5, y6, y7 ; + + if (block_end <= n) + { + // unrolling 8 + x0 = xdata [tid] ; + x1 = xdata [tid + s] ; + x2 = xdata [tid + 2 * s] ; + x3 = xdata [tid + 3 * s] ; + x4 = xdata [tid + 4 * s] ; + x5 = xdata [tid + 5 * s] ; + x6 = xdata [tid + 6 * s] ; + x7 = xdata [tid + 7 * s] ; + + y0 = ydata [tid] ; + y1 = ydata [tid + s] ; + y2 = ydata [tid + 2 * s] ; + y3 = ydata [tid + 3 * s] ; + y4 = ydata [tid + 4 * s] ; + y5 = ydata [tid + 5 * s] ; + y6 = ydata [tid + 6 * s] ; + y7 = ydata [tid + 7 * s] ; + /* + if (b == 0) + { + printf ("block zero: here is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + */ + + } + else + { + // the last block has size less than 8*s + #define XDATA(i) ((i < lastblocksize) ? xdata [i] : MONOID_IDENTITY1) + #define YDATA(i) ((i < lastblocksize) ? 
ydata [i] : MONOID_IDENTITY2) + int lastblocksize = n - block_start ; + x0 = XDATA (tid) ; + x1 = XDATA (tid + s) ; + x2 = XDATA (tid + 2 * s) ; + x3 = XDATA (tid + 3 * s) ; + x4 = XDATA (tid + 4 * s) ; + x5 = XDATA (tid + 5 * s) ; + x6 = XDATA (tid + 6 * s) ; + x7 = XDATA (tid + 7 * s) ; + + y0 = YDATA (tid) ; + y1 = YDATA (tid + s) ; + y2 = YDATA (tid + 2 * s) ; + y3 = YDATA (tid + 3 * s) ; + y4 = YDATA (tid + 4 * s) ; + y5 = YDATA (tid + 5 * s) ; + y6 = YDATA (tid + 6 * s) ; + y7 = YDATA (tid + 7 * s) ; + } + + //work [tid] = mul(x0,y0) + mul(x1,y1) + mul(x2,y2) + mul(x3,y3) + // + mul(x4,y4) + mul(x5,y5) + mul(x6,y6)+ mul(x7,y7) ; + sum = ADD( MUL(x0,y0) , ADD( MUL(x1,y1) , ADD( MUL(x2,y2), + ADD( MUL(x3,y3) , ADD( MUL(x4,y4) , ADD( MUL(x5,y5), + ADD( MUL(x6,y6) , MUL(x7,y7)))))))) ; + + /* + if (b == 0) + { + printf ("block zero: still is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + + if (b == 0) + { + printf ("block zero: here is tid %d result %g is %g\n", + tid, sum, + (double) (x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7)) ; + } + */ + + __syncthreads ( ) ; + + //-------------------------------------------------------------------------- + // reduce per-thread sums to a single scalar + //-------------------------------------------------------------------------- + + sum = block_ReduceSum( this_thread_block(), sum); + + // write result for this block to global mem + if (tid == 0) + { + printf ("final %d : %g\n", b, (T3) sum) ; + g_odata [b] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu b/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu new file mode 100644 index 0000000000..02953cbd9d --- /dev/null +++ b/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu @@ -0,0 +1,108 @@ +//------------------------------------------------------------------------------ +// reduceUnrolled.cu +//------------------------------------------------------------------------------ + +// The reduceUnrolled CUDA kernel reduces an array g_idata of size n, of any +// type T, to an array g_odata of size grid.x. Each threadblock (blockIdx.x) +// reduces its portion of g_idata to a single scalar, g_odata [blockIdx.x]. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. 
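+
+// Note: unlike the unrolled kernel described in the comment above (which was
+// carried over from reduceUnrolled.cu), the kernel in this file uses a
+// grid-stride loop and skips "zombie" entries, i.e. entries whose index[i]
+// has been flipped (GB_FLIP) and is therefore negative.  An illustrative
+// launch sketch (hypothetical host code; blockDim must be a multiple of 32):
+//
+//   int s = 128 ;
+//   int nblocks = (N + s - 1) / s ;                    // or a capped grid size
+//   reduceNonZombiesWarp <double> <<<nblocks, s>>> (Ai, Ax, partial, N) ;
+//   // partial [0..nblocks-1] must then be reduced once more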
+ +#define GB_KERNEL +#include +#include +#include +#include "mySemiRing.h" + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSum( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T fold = g.shfl_down( val, i); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val = GB_ADD( val, fold ); + } + //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x & 31 ; // % warpSize; + int wid = threadIdx.x >> 5 ; // / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid] = val; // Write reduced value to shared memory + //printf("thd%d stored warp%d sum %d\n", threadIdx.x, wid, val); + } + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 ) return val; + //read from shared memory only if that warp existed + else { + val = (threadIdx.x < (blockDim.x / warpSize) ) ? shared[lane] : GB_IDENTITY ; + //if (lane < (blockDim.x/ warpSize) ) printf("thd%d warp%d loaded val = %d\n", threadIdx.x, lane, val); + val = warp_ReduceSum( tile, val); //Final reduce within first warp + } + + return val; +} + +template< typename T> +__global__ void reduceNonZombiesWarp +( + int64_t *index, // array of size n + T *g_idata, // array of size n + T *g_odata, // array of size grid.x + unsigned int N +) +{ + // set thread ID + int tid = threadIdx.x ; + + // each thread tid reduces its result into sum + T sum = (T) GB_IDENTITY; + + for(int i = blockIdx.x * blockDim.x + threadIdx.x; + i < N; + i += blockDim.x * gridDim.x) { + if ( index[i] < 0) continue; + T fold = g_idata[i]; + sum = GB_ADD( sum, fold ); + } + //printf("thd%d sum is %d\n", threadIdx.x + blockDim.x*blockIdx.x, sum); + __syncthreads(); + //-------------------------------------------------------------------------- + // reduce work [0..s-1] to a single scalar + //-------------------------------------------------------------------------- + // this assumes blockDim is a multiple of 32 + sum = block_ReduceSum< T, 32 >( this_thread_block(), sum) ; + + // write result for this block to global mem + if (tid == 0) + { + g_odata [blockIdx.x] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/reduceUnrolled.cu b/GraphBLAS/CUDA/templates/reduceUnrolled.cu new file mode 100644 index 0000000000..da2e3e3eed --- /dev/null +++ b/GraphBLAS/CUDA/templates/reduceUnrolled.cu @@ -0,0 +1,187 @@ +//------------------------------------------------------------------------------ +// reduceUnrolled.cu +//------------------------------------------------------------------------------ + +// The reduceUnrolled CUDA kernel reduces an array g_idata of size n, of any +// type T, to an array g_odata of size grid.x. Each threadblock (blockIdx.x) +// reduces its portion of g_idata to a single scalar, g_odata [blockIdx.x]. 
+ +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. + +#include "myOp.h" +#include +#include "GB_cuda.h" + +GrB_Matrix Stuff ; // hack hack hack + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSum( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T fold = g.shfl_down( val, i); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val = OP( val, fold ); + } + //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid]=val; // Write reduced value to shared memory + //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val); + } + g.sync(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : MONOID_IDENTITY; + //printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); + + + if (wid==0) val = warp_ReduceSum( tile, val); //Final reduce within first warp + + return val; +} + +template< typename T> +__global__ void reduceUnrolled +( + T *g_idata, // array of size n + T *g_odata, // array of size grid.x + unsigned int n +) +{ + // set thread ID + unsigned int tid = threadIdx.x ; + + // this threadblock b owns g_idata [block_start ... block_end-1] + unsigned long int s = blockDim.x ; + unsigned long int b = blockIdx.x ; + unsigned long int block_start = b * s * 8 ; + unsigned long int block_end = (b + 1) * s * 8 ; + + /* + if (tid == 0) + { + printf ("block %d: [%lu ... 
%ld]\n", b, block_start, block_end-1) ; + } + */ + + /* + if (tid == 0 && b == 0) + { + printf ("type is size %d\n", sizeof (T)) ; + for (int k = 0 ; k < n ; k++) printf ("%4d: %g\n", k, (double) g_idata [k]) ; + printf ("\n") ; + } + */ + + // nothing to do + if (block_start > block_end) { if (tid == 0) printf ("bye!\n") ; return ; } + + // convert global data pointer to the local pointer of this block + T *idata = g_idata + block_start ; + + T x0, x1, x2, x3, x4, x5, x6, x7 ; + + if (block_end <= n) + { + // unrolling 8 + x0 = idata [tid] ; + x1 = idata [tid + s] ; + x2 = idata [tid + 2 * s] ; + x3 = idata [tid + 3 * s] ; + x4 = idata [tid + 4 * s] ; + x5 = idata [tid + 5 * s] ; + x6 = idata [tid + 6 * s] ; + x7 = idata [tid + 7 * s] ; + + /* + if (b == 0) + { + printf ("block zero: here is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + */ + + } + else + { + // the last block has size less than 8*s + #define IDATA(i) ((i < lastblocksize) ? idata [i] : MONOID_IDENTITY) + int lastblocksize = n - block_start ; + x0 = IDATA (tid) ; + x1 = IDATA (tid + s) ; + x2 = IDATA (tid + 2 * s) ; + x3 = IDATA (tid + 3 * s) ; + x4 = IDATA (tid + 4 * s) ; + x5 = IDATA (tid + 5 * s) ; + x6 = IDATA (tid + 6 * s) ; + x7 = IDATA (tid + 7 * s) ; + } + T sum; + //work [tid] = x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 ; + sum = OP( x0 ,OP( x1, OP( x2, OP( x3, + OP( x4, OP( x5 , OP( x6 , x7))))))) ; + + /* + if (b == 0) + { + printf ("block zero: still is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + + if (b == 0) + { + printf ("block zero: here is tid %d result %g is %g\n", + tid, (double) work [tid], + (double) (x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7)) ; + } + */ + + __syncthreads ( ) ; + + //-------------------------------------------------------------------------- + // reduce work [0..s-1] to a single scalar + //-------------------------------------------------------------------------- + + // This assumes that s is a power of 2 and <= 1024, and at least 32 + // This assumes blockDim is a multiple of 32 + sum = block_ReduceSum( this_thread_block(), sum); + + // write result for this block to global mem + if (tid == 0) + { + // printf ("final %d : %g\n", b, (double) work [0]) ; + g_odata [b] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/reduceWarp.cu b/GraphBLAS/CUDA/templates/reduceWarp.cu new file mode 100644 index 0000000000..000733b522 --- /dev/null +++ b/GraphBLAS/CUDA/templates/reduceWarp.cu @@ -0,0 +1,103 @@ +//------------------------------------------------------------------------------ +// reduceUnrolled.cu +//------------------------------------------------------------------------------ + +// The reduceUnrolled CUDA kernel reduces an array g_idata of size n, of any +// type T, to an array g_odata of size grid.x. Each threadblock (blockIdx.x) +// reduces its portion of g_idata to a single scalar, g_odata [blockIdx.x]. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. 
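reduceWarp.cu below, like the other templates in this directory, never names a concrete semiring: the ADD/MUL/MULADD operators and the monoid identities arrive through macros supplied by a header that the JIT machinery generates and includes as mySemiRing.h. Purely as an illustration of the contract the templates assume, a hand-written stand-in for a conventional PLUS_TIMES semiring over float could look like the sketch below; every definition in it is an assumption for the sketch, not the generated code.

// mySemiRing.h -- illustrative stand-in for the JIT-generated semiring header.
// It supplies the macros the templates in this directory refer to, specialized
// here (as an assumption for the sketch) to PLUS_TIMES over float.
#pragma once

#define MONOID_IDENTITY   0.0f              // identity of the additive monoid
#define MONOID_IDENTITY1  0.0f              // identity for the T1 inputs (denseDotProduct)
#define MONOID_IDENTITY2  0.0f              // identity for the T2 inputs (denseDotProduct)
#define MONOID_IDENTITY3  0.0f              // identity for the T3 accumulator

#define ADD(x,y)        ((x) + (y))         // additive monoid
#define MUL(x,y)        ((x) * (y))         // multiplicative operator
#define MULADD(s,x,y)   ((s) += (x) * (y))  // fused form used by sparseDotProduct
#define OP(x,y)         ADD (x, y)          // generic reduction operator (reduceWarp, reduceUnrolled)
#define GB_ADD(x,y)     ADD (x, y)          // GB_-prefixed aliases used by reduceNonZombiesWarp
#define GB_IDENTITY     MONOID_IDENTITY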
+ +#include "mySemiRing.h" +#include + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSum( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T fold = g.shfl_down( val, i); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val = OP( val, fold ); + } + //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid]=val; // Write reduced value to shared memory + //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val); + } + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : MONOID_IDENTITY; + //printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); + + + if (wid==0) val = warp_ReduceSum( tile, val); //Final reduce within first warp + + return val; +} + +template< typename T> +__global__ void reduceWarp +( + T *g_idata, // array of size n + T *g_odata, // array of size grid.x + unsigned int N +) +{ + // set thread ID + unsigned int tid = threadIdx.x ; + + // each thread tid reduces its result into sum + T sum = (T) MONOID_IDENTITY; + + for(int i = blockIdx.x * blockDim.x + threadIdx.x; + i < N; + i += blockDim.x * gridDim.x) { + sum = OP( sum, g_idata[i]); + } + //printf("thd%d sum is %d\n", threadIdx.x + blockDim.x*blockIdx.x, sum); + __syncthreads(); + //-------------------------------------------------------------------------- + // reduce work [0..s-1] to a single scalar + //-------------------------------------------------------------------------- + // this assumes blockDim is a multiple of 32 + sum = block_ReduceSum( this_thread_block(), sum); + + // write result for this block to global mem + if (tid == 0) + { + // printf ("final %d : %g\n", b, (double) work [0]) ; + g_odata [blockIdx.x] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/sparseDotProduct.cu b/GraphBLAS/CUDA/templates/sparseDotProduct.cu new file mode 100644 index 0000000000..ee1943b21c --- /dev/null +++ b/GraphBLAS/CUDA/templates/sparseDotProduct.cu @@ -0,0 +1,189 @@ +//------------------------------------------------------------------------------ +// sparseDotProduct_merge_path.cu +//------------------------------------------------------------------------------ + +// The sparseDotProduct CUDA kernel produces the semi-ring dot product of two +// sparse vectors of types T1 and T2 and common index space size n, to a scalar +// odata of type T3. The vectors are sparse, with different numbers of non-zeros. +// ie. we want to produce dot(x,y) in the sense of the given semi-ring. + +// This version uses a merge-path algorithm, when the sizes g_xnz and g_ynz are +// relatively close in size, but for any size of N. 
+ +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(g_xnz, g_ynz), 32) + +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job +// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot +// product on those items in the intersection, and finally reduce this data to a scalar, +// on exit write it to g_odata [b]. + +#include +#include +#include "mySemiRing.h" + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__device__ T reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val = ADD( val, g.shfl_down(val,i) ); + //if (g.thread_rank() ==0) + // printf("in reduce_sum i=%i val = %f\n", i, val); + } + return val; // note: only thread 0 will return full sum +} + +#define INTMIN( A, B) ( (A) < (B) ) ? (A) : (B) +#define INTMAX( A, B) ( (A) > (B) ) ? (A) : (B) +#define intersects_per_thread 4 + +template< typename T1, typename T2, typename T3> +__global__ void sparseDotProduct +( + unsigned int g_xnz, // Number of non-zeros in x + unsigned int *g_xi, // Non-zero indices in x, size xnz + T1 *g_xdata, // array of size xnz, type T1 + unsigned int g_ynz, // Number of non-zeros in y + unsigned int *g_yi, // Non-zero indices in y, size ynz + T2 *g_ydata, // array of size ynz, type T2 + T3 *g_odata // array of size grid.x, type T3 +) +{ + // set thread ID + unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + unsigned int tid = threadIdx.x; + + unsigned long int b = blockIdx.x ; + + // total items to be inspected + unsigned int nxy = (g_xnz + g_ynz); + + //largest possible number of intersections is the smaller nz + unsigned int n_intersect = INTMIN( g_xnz, g_ynz); + + //we want more than one intersection per thread + unsigned int parts = (n_intersect+ intersects_per_thread -1)/ intersects_per_thread; + + unsigned int work_per_thread = (nxy +parts -1)/parts; + unsigned int diag = INTMIN( work_per_thread*tid_global, nxy); + unsigned int diag_end = INTMIN( diag + work_per_thread, nxy); + //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, parts, work_per_thread, diag, diag_end); + + unsigned int x_min = INTMAX( (int)(diag - g_ynz), 0); + unsigned int x_max = INTMIN( diag, g_xnz); + + //printf("start thd%u x_min = %u x_max = %u\n", tid_global, x_min,x_max); + while ( x_min < x_max) { //binary search for correct diag break + unsigned int pivot = (x_min +x_max)/2; + if ( g_xi[pivot] < g_yi[ diag -pivot -1]) { + x_min = pivot +1; + } + else { + x_max = pivot; + } + } + int xcoord = x_min; + int ycoord = diag -x_min -1; + if (( diag > 0) &&(diag < (g_xnz+g_ynz)) && (g_xi[xcoord] == g_yi[ycoord]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two start points are known now + int x_start = xcoord; + int y_start = diag -xcoord; + + //if (x_start != y_start) + // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); + + x_min = INTMAX( (int)(diag_end - g_ynz), 0); + x_max = INTMIN( diag_end, g_xnz); + + while ( x_min < x_max) { + unsigned int pivot = (x_min +x_max)/2; + //printf("thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\n", tid_global, pivot, diag_end,x_min, x_max); + if ( g_xi[pivot] < g_yi[ diag_end -pivot -1]) { 
+ x_min = pivot +1; + } + else { + x_max = pivot; + } + //printf("thd%u piv=%u xmin,xmax = %u,%u\n", tid_global, pivot, x_min, x_max); + } + xcoord = x_min; + ycoord = diag_end -x_min -1; + if ( (diag_end < (g_xnz+g_ynz)) && (g_xi[xcoord] == g_yi[ycoord]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two end points are known now + int x_end = xcoord; + int y_end = diag_end - xcoord; + + /* + if (tid == 0 && b == 0) { + printf ("type1 is size %d\n", sizeof (T1)) ; + for (int k = 0 ; k < g_xnz ; k++) printf ("%4d: %g,", k, (T1) g_xdata [k]) ; + printf ("\n") ; + printf ("type2 is size %d\n", sizeof (T2)) ; + for (int k = 0 ; k < g_ynz ; k++) printf ("%4d: %g,", k, (T2) g_ydata [k]) ; + printf ("\n") ; + } + __syncthreads(); + */ + + T3 sum = (T3) 0; + //printf(" thd%u has init value %f\n",tid, sum); + + // nothing to do + if ( (x_start >= x_end) || (y_start >= y_end) ) { return ; } + + //merge-path dot product + int k = x_start; + int l = y_start; + while ( k < x_end && l < y_end ) + { + if ( g_xi[k] < g_yi[l] ) k += 1; + else if ( g_xi[k] > g_yi[l] ) l += 1; + else { + //printf(" thd%d ix at %u \n",tid_global,g_xi[k]); + //printf(" sum += %f * %f \n",tid,g_xdata[k],g_ydata[l]); + //sum = ADD( sum, MUL( g_xdata[k], g_ydata[l])); + MULADD( sum, g_xdata[k], g_ydata[l]); + //printf(" thd%u work value = %f\n",tid_global, sum); + k+= 1; + l+= 1; + } + + } + + __syncthreads ( ) ; + /* + if (1) + { + printf ("thd%u done with intersect and multiply, val = %f\n",tid_global, sum) ; + } + __syncthreads ( ) ; + */ + + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar + //-------------------------------------------------------------------------- + // Using tile size fixed at compile time, we don't need shared memory + #define tile_sz 32 + thread_block_tile tile = tiled_partition( this_thread_block()); + T3 block_sum = reduce_sum(tile, sum); + + // write result for this block to global mem + if (tid == 0) + { + printf ("final %d : %g\n", b, block_sum) ; + g_odata [b] = block_sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/stuff.cu b/GraphBLAS/CUDA/templates/stuff.cu new file mode 100644 index 0000000000..9241fd1ebb --- /dev/null +++ b/GraphBLAS/CUDA/templates/stuff.cu @@ -0,0 +1,9 @@ + val = ADD( val, g.shfl_down( val, i) ); + + + t = g.shfl_down( val, i) ; + val = ADD( val, t ); + + GB_ADD (val, t) ; // statment val = GB_ADD_FUNCTION (val, t) + + diff --git a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.hpp b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.hpp new file mode 100644 index 0000000000..197d24e5ec --- /dev/null +++ b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.hpp @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Test AxB_dot3_cuda kernels +// Using data generators and test classes, cover +// all 12 cases for the masked GEMM ( C, M, A, B) in GraphBLAS +// Tests Semirings, data types and a range of data input sizes and shapes +// Connects to the jitFactory for launches. 
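The instances generated into AxB_dot3_test_instances.hpp (the next file) are dense one-liners; reformatted, a single instance has the shape shown below. The seven template arguments are the C, M, A, B, X, Y and Z types spelled out in the test name, and the positional arguments are passed straight through to the factory: (5, 32, 256, 128) for the "tinyxtiny" variants and (5, 1024, 65536, 65536) for the "smallxsmall" ones.

TEST (AxB_dot3_tests_warp,
      tinyxtinyPLUS_TIMESCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t)
{
    std::string SR = "PLUS_TIMES" ;
    test_AxB_dot3_warp_factory <int32_t, bool, int32_t, int32_t,
                                int32_t, int32_t, int32_t> (5, 32, 256, 128, SR) ;
}

Any subset of these combinations can then be selected at run time with the standard googletest filter, for example ./cudaTest --gtest_filter='AxB_dot3_tests_warp.tinyxtiny*' (cudaTest being the target built by the test Makefile further below).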
+ +#include +#include +#include +#include +#include +#include "jitTestFactory.hpp" +#include "gtest/gtest.h" + +//Test instances and groupings +#include "AxB_dot3_test_instances.hpp" + diff --git a/GraphBLAS/CUDA/test/AxB_dot3_test_instances.hpp b/GraphBLAS/CUDA/test/AxB_dot3_test_instances.hpp new file mode 100644 index 0000000000..1719df741c --- /dev/null +++ b/GraphBLAS/CUDA/test/AxB_dot3_test_instances.hpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: Apache-2.0 +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyPLUS_TIMESCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( 
AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyMIN_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyMIN_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyMAX_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ 
std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} diff --git a/GraphBLAS/CUDA/test/GpuTimer.h b/GraphBLAS/CUDA/test/GpuTimer.h new file mode 
100644 index 0000000000..63c3e1aaf4 --- /dev/null +++ b/GraphBLAS/CUDA/test/GpuTimer.h @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +#ifndef __GPU_TIMER_H__ +#define __GPU_TIMER_H__ + +struct GpuTimer +{ + cudaEvent_t start; + cudaEvent_t stop; + + GpuTimer() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + } + + ~GpuTimer() + { + cudaEventDestroy(start); + cudaEventDestroy(stop); + } + + void Start() + { + cudaEventRecord(start, 0); + } + + void Stop() + { + cudaEventRecord(stop, 0); + } + + float Elapsed() + { + float elapsed; + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed, start, stop); + return elapsed; + } +}; + +#endif /* __GPU_TIMER_H__ */ diff --git a/GraphBLAS/CUDA/test/Makefile b/GraphBLAS/CUDA/test/Makefile new file mode 100644 index 0000000000..289e5da8c8 --- /dev/null +++ b/GraphBLAS/CUDA/test/Makefile @@ -0,0 +1,133 @@ +#------------------------------------------------------------------------------- +# GraphBLAS/CUDA/Makefile +#------------------------------------------------------------------------------- + +# cuda 10.1+ is assumed + +all: cudaTest + + +LIBS = -L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs -lpthreads -lcudadevrt -lcudart -lnvrtc +INC += -I$(CUDA_DIR)/include -I../ -I../../Source -I../../Include -I../../Source/Template -I$(TEMPLATE_DIR) -Igoogletest/include + +CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75\ + --relocatable-device-code true --device-c\ + --std=c++11 -Xcompiler -fPIC + + +%.o: %.cu + nvcc -c $(I) $(CUDA_OPTS) $(INC) -o $@ $< + +config: + nvidia-smi + nvcc --version + @echo " " + @echo "SO_NAME: " $(SO_NAME) + @echo "SO_OPTS: " $(SO_OPTS) + @echo "LIBS: " $(LIBS) + @echo "CUDA_OPTS: " $(CUDA_OPTS) + @echo "SRC: " $(SRC) + @echo "OBJ: " $(OBJ) + @echo "I: " $(I) + @echo " " + gcc --version + icc --version + +clean: + rm -f *.o + rm -f stringify + rm -f cudaTest + rm -f testJit +.PHONY: clean + +distclean: clean + rm -f *.so *.a + +purge: distclean + +################################################################################ + +GXX ?= g++ +GCC ?= gcc +DOXYGEN ?= doxygen +CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 +CFLAGS ?= -O2 -g -std=c11 + +CXX11 ?= 1 + +CUDA_DIR ?= /usr/local/cuda + +CXXFLAGS += -pthread + +ifeq ($(CXX11),1) + CXXFLAGS += -std=c++14 +endif + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + +TEMPLATE_DIR ?= ../templates + +LIB += -ldl -L$(CUDA_LIB_DIR) -L$(CUDA_LIB_DIR)/stubs -lcuda -lcudadevrt -lcudart -lnvrtc + +HEADERS = jitify.hpp dataFactory.hpp jitFactory.hpp jitTestFactory.hpp semiringFactory.hpp \ + ../type_name.hpp + +TEMPLATES := $(wildcard $(TEMPLATE_DIR)/*.cu) + +CU_OBJS := ../GB_jit_cache.o ../GB_jit_launcher.o + +CFILES := $(wildcard ../*.c) + +COBJS := $(patsubst %.c, %.o, $(CFILES) ) + +JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) + +GTEST_LIB := googletest/build/lib/libgtest.a googletest/build/lib/libgtest_main.a + +%.cu.jit: %.cu + ../stringify $? 
> $@ + +stringify: stringify.cpp + $(GXX) -o $@ $< -O3 -Wall + +%.o: %.c + $(GXX) -c -o $@ $< $(CFLAGS) $(INC) + +%.o: %.cpp + $(GXX) -c -o $@ $< $(CXXFLAGS) $(INC) + +cu_link.o: $(CU_OBJS) + nvcc --gpu-architecture=compute_75 --device-link $(CU_OBJS) --output-file cu_link.o + + +testJit: testJit.cpp $(OBJS) $(HEADERS) $(JIT_TEMP) + $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(OBJS) $(LIB) + +AxB_dot3_test_instances.hpp: testGen.py + python3 testGen.py + + +instances := AxB_dot3_test_instances.hpp + + +cudaTest: cudaTest.cpp $(COBJS) $(OBJS) $(HEADERS) $(JIT_TEMP) cu_link.o AxB_dot3_cuda_tests.hpp $(instances) + $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(COBJS) $(CU_OBJS) cu_link.o $(LIB) $(GTEST_LIB) + +%.cu: %.cutmp + cp $? $@ + + +doc: jitify.hpp Doxyfile + $(DOXYGEN) Doxyfile +.PHONY: doc + + diff --git a/GraphBLAS/CUDA/test/cudaTest.cpp b/GraphBLAS/CUDA/test/cudaTest.cpp new file mode 100644 index 0000000000..e635ae39ab --- /dev/null +++ b/GraphBLAS/CUDA/test/cudaTest.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Extended example for building on-the-fly kernels with C interface. + Simple examples demonstrating different ways to load source code + and call kernels. + */ + + +#include "AxB_dot3_cuda_tests.hpp" +#include "gtest/gtest.h" + + +//int main(int argc, char* argv[]) { +#if __cplusplus >= 201103L + +//#define TEST_RESULT(result) (result ? 
"PASSED" : "FAILED") +//std::cout << "Running tests..."<(256, 32,120,"PLUS_TIMES"); + EXPECT_EQ( true, test_spdot_plus_times_ffd_nu); +} + +TEST(MergePathDot, PlusTimesffdLarge) { + bool test_spdot_plus_times_ffd_lrg_nu = test_spdotfactoryUM(4096, 256,256,"PLUS_TIMES"); + EXPECT_EQ(true, test_spdot_plus_times_ffd_lrg_nu); +} + +TEST(MergePathDot, PlusTimesfff) { + bool test_spdot_plus_times_fff = test_spdotfactoryUM(256, 32,32,"PLUS_TIMES"); + EXPECT_EQ(true, test_spdot_plus_times_fff); +} + +TEST(MergePathDot, PlusTimeffdTiny) { + bool test_spdot_plus_times_ffd = test_spdotfactoryUM(256, 32,32,"PLUS_TIMES"); + EXPECT_EQ(true, test_spdot_plus_times_ffd); +} + +TEST(VSVSDot, PlusTimesfff) { + bool test_spdot_batch_fff = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + EXPECT_EQ( true, test_spdot_batch_fff); +} + +TEST(VSVSDot, PlusTimesiii) { + bool test_spdot_batch_iii = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + EXPECT_EQ( true, test_spdot_batch_iii); +} + + + +// bool test_spdot_batch_fff = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + + cudaSetDevice(0); + cudaDeviceReset(); + bool test_spdot_batch_iii = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + std::cout << "test_spdot_batchUM uncached: " + << TEST_RESULT(test_spdot_batch_iii) + << std::endl; + + cudaSetDevice(1); + cudaDeviceReset(); + + bool test_spdot_batch_iii2= test_spdot_batch_factoryUM(5, 32, 256, 128, "PLUS_TIMES"); + std::cout << "test_spdot_batchUM uncached: " + << TEST_RESULT(test_spdot_batch_iii2) + << std::endl; + + + + + + bool test_dot_min_plus_iil = test_dotfactoryUM(4096,"MIN_PLUS"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_min_plus_iil) + << std::endl; + + bool test_dot_min_plus_ffd = test_dotfactoryUM(4096,"MIN_PLUS"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_min_plus_ffd) + << std::endl; + + bool test_dot_plus_times_ffd = test_dotfactoryUM(4096,"PLUS_TIMES"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_plus_times_ffd) + << std::endl; + + bool test_dot_plus_times_fii = test_dotfactoryUM(4096,"PLUS_TIMES"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_plus_times_fii) + << std::endl; + + bool test_dot_plus_times_iil = test_dotfactoryUM(4096,"PLUS_TIMES"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_plus_times_iil) + << std::endl; + + bool test_reducefactory_float_result = test_reducefactoryUM(4096, "PLUS"); + std::cout << "test_reducefactoryUM uncached: " + << TEST_RESULT(test_reducefactory_float_result) + << std::endl; + + bool test_reducefactory_double_plus_result = test_reducefactoryUM(4096, "PLUS"); + std::cout << "test_reducefactoryUM uncached: " + << TEST_RESULT(test_reducefactory_double_plus_result) + << std::endl; + + std::cout << "testing cached kernel" <(4096, "PLUS"); + std::cout << "test_reducefactoryUM cached: " + << TEST_RESULT(test2_reducefactory_double_plus_result) + << std::endl; + + bool test_reducefactory_float_min_result = test_reducefactoryUM(32,"MIN"); + std::cout << "test_reducefactoryUM MIN uncached: " + << TEST_RESULT(test_reducefactory_float_min_result) + << std::endl; + + bool test_reducefactory_int_min_result = test_reducefactoryUM(32,"MIN"); + std::cout << "test_reducefactoryUM MIN uncached: " + << TEST_RESULT(test_reducefactory_int_min_result) + << std::endl; + + bool test_reducefactory_int_max_result = test_reducefactoryUM(32,"MAX"); + std::cout << "test_reducefactoryUM MAX uncached: 
" + << TEST_RESULT(test_reducefactory_int_max_result) + << std::endl; + + bool test_reducefactory_int_result = test_reducefactoryUM(4096,"PLUS"); + std::cout << "test_reducefactoryUM PLUS uncached: " + << TEST_RESULT(test_reducefactory_int_result) + << std::endl; + + bool test_reducefactory_int_cache_result = + test_reducefactoryUM(4096,"PLUS"); + std::cout << "test_reducefactoryUM PLUS cached: " + << TEST_RESULT(test_reducefactory_int_cache_result) + << std::endl; +*/ +#endif diff --git a/GraphBLAS/CUDA/test/dataFactory.hpp b/GraphBLAS/CUDA/test/dataFactory.hpp new file mode 100644 index 0000000000..e2a345d43b --- /dev/null +++ b/GraphBLAS/CUDA/test/dataFactory.hpp @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. 
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + } +} +#define CHECK_CUDA(call) checkCudaErrors( call ) + +//Vector generators +template +void fillvector_linear( int N, T *vec) { + for (int i = 0; i< N; ++i) vec[i] = T(i); +} +template +void fillvector_constant( int N, T *vec, T val) { + for (int i = 0; i< N; ++i) vec[i] = val; +} + +// Mix-in class to enable unified memory +class Managed { +public: + void *operator new(size_t len) { + void *ptr = nullptr; + //std::cout<<"in new operator, alloc for "< +class matrix : public Managed { + public: + uint64_t zombie_count = 0; + int64_t vlen; + int64_t vdim; + int64_t nnz; + int64_t *p = nullptr; + int64_t *h = nullptr; + int64_t *i = nullptr; + T *x = nullptr; + bool is_filled = false; + + matrix(){}; + + matrix( int64_t N, int64_t nvecs){ + vlen = N; + vdim = nvecs; + } + + void set_zombie_count( uint64_t zc) { zombie_count = zc;} + uint64_t get_zombie_count() { return zombie_count;} + void add_zombie_count( int nz) { zombie_count += nz;} + + void clear() { + if ( p != nullptr){ cudaFree(p); p = nullptr; } + if ( h != nullptr){ cudaFree(h); h = nullptr; } + if ( i != nullptr){ cudaFree(i); i = nullptr; } + if ( x != nullptr){ cudaFree(x); x = nullptr; } + is_filled = false; + vlen = 0; + vdim = 0; + nnz = 0; + zombie_count = 0; + } + + void alloc( int64_t N, int64_t Nz) { + + //cudaMallocManaged((void**)&p, (Nz+N+1)*sizeof(int64_t)+ (Nz*sizeof(T))); + //i = p+(N+1); + //x = (T*)(p + (Nz+N+1)); + CHECK_CUDA( cudaMallocManaged((void**)&p, (N+1)*sizeof(int64_t)) ); + CHECK_CUDA( cudaMallocManaged((void**)&i, Nz*sizeof(int64_t)) ); + CHECK_CUDA( cudaMallocManaged((void**)&x, Nz*sizeof(T)) ); + + } + + void fill_random( int64_t N, int64_t Nz, std::mt19937 r) { + + int64_t inv_sparsity = (N*N)/Nz; //= values not taken per value occupied in index space + + //std::cout<< "fill_random N="<< N<<" need "<< Nz<<" values, invsparse = "< +class SpGEMM_problem_generator { + + float Anzpercent,Bnzpercent,Cnzpercent; + int64_t Cnz; + int64_t *Bucket = nullptr; + int64_t BucketStart[13]; + unsigned seed = 13372801; + std::mt19937 r; //random number generator Mersenne Twister + bool ready = false; + + public: + + matrix *C= nullptr; + matrix *M= nullptr; + matrix *A= nullptr; + matrix *B= nullptr; + + SpGEMM_problem_generator() { + + //std::cout<<"creating matrices"<; + // CHECK_CUDA( cudaMallocManaged( (void**)&C, sizeof(matrix)) ); + //cudaMemAdvise ( C, sizeof(matrix), cudaMemAdviseSetReadMostly, 1); + //std::cout<<"created C matrix"<; + //cudaMallocManaged( (void**)&M, sizeof(matrix)); + //cudaMemAdvise ( M, sizeof(matrix), cudaMemAdviseSetReadOnly, 1); + //std::cout<<"created M matrix"<; + //cudaMallocManaged( (void**)&A, sizeof(matrix)); + //cudaMemAdvise ( C, sizeof(matrix), cudaMemAdviseSetReadOnly, 1); + //std::cout<<"created A matrix"<; + //cudaMallocManaged( (void**)&B, sizeof(matrix)); + //cudaMemAdvise ( C, sizeof(matrix), cudaMemAdviseSetReadOnly, 1); + //std::cout<<"created B matrix"<* getCptr(){ return C;} + matrix* getMptr(){ return M;} + matrix* getAptr(){ return A;} + matrix* getBptr(){ return B;} + + int64_t* getBucket() { return Bucket;} + int64_t* getBucketStart(){ return BucketStart;} 
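// Note on the members that follow: loadCj() stores each entry's column index in
// C->i shifted left by 4 bits, which keeps the low 4 bits of C->i free to carry a
// bucket id for the 12 dot3 cases the tests exercise (BucketStart has 13 entries,
// consistent with start offsets for 12 buckets), and fill_buckets() appears to
// assign every entry of C to the single bucket selected by its fill_bucket
// argument so that one kernel variant can be exercised in isolation.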
+ + void loadCj() { + + // Load C_i with column j info to avoid another lookup + for (int c = 0 ; c< M->vdim; ++c) { + for ( int r = M->p[c]; r< M->p[c+1]; ++r){ + C->i[r] = c << 4 ; //shift to store bucket info + } + } + + } + + void init( int64_t N , int64_t Anz, int64_t Bnz, float Cnzpercent){ + + // Get sizes relative to fully dense matrices + Anzpercent = float(Anz)/float(N*N); + Bnzpercent = float(Bnz)/float(N*N); + Cnzpercent = Cnzpercent; + Cnz = (int64_t)(Cnzpercent * N * N); + std::cout<<"Anz% ="<fill_random( N, Cnz, r); + M->fill_random( N, Cnz, r); + A->fill_random( N, Anz, r); + B->fill_random( N, Bnz, r); + + std::cout<<"fill complete"<p = M->p; //same column pointers (assuming CSC here) + + loadCj(); + + } + + void del(){ + C->clear(); + M->clear(); + A->clear(); + B->clear(); + if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) ); + delete C; + delete M; + delete A; + delete B; + CHECK_CUDA( cudaDeviceSynchronize() ); + } + + void fill_buckets( int fill_bucket){ + + std::cout< fill_bucket) BucketStart[b] = Cnz; + //std::cout<< " one bucket "<< b<<"starts at "<" + "$/${CMAKE_INSTALL_INCLUDEDIR}>") + target_include_directories(gmock_main SYSTEM INTERFACE + "$" + "$/${CMAKE_INSTALL_INCLUDEDIR}>") +endif() + +######################################################################## +# +# Install rules +install_project(gmock gmock_main) + +######################################################################## +# +# Google Mock's own tests. +# +# You can skip this section if you aren't interested in testing +# Google Mock itself. +# +# The tests are not built by default. To build them, set the +# gmock_build_tests option to ON. You can do it by running ccmake +# or specifying the -Dgmock_build_tests=ON flag when running cmake. + +if (gmock_build_tests) + # This must be set in the root directory for the tests to be run by + # 'make test' or ctest. + enable_testing() + + if (WIN32) + file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/$/RunTest.ps1" + CONTENT +"$project_bin = \"${CMAKE_BINARY_DIR}/bin/$\" +$env:Path = \"$project_bin;$env:Path\" +& $args") + elseif (MINGW OR CYGWIN) + file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/RunTest.ps1" + CONTENT +"$project_bin = (cygpath --windows ${CMAKE_BINARY_DIR}/bin) +$env:Path = \"$project_bin;$env:Path\" +& $args") + endif() + + if (MINGW OR CYGWIN) + if (CMAKE_VERSION VERSION_LESS "2.8.12") + add_compile_options("-Wa,-mbig-obj") + else() + add_definitions("-Wa,-mbig-obj") + endif() + endif() + + ############################################################ + # C++ tests built with standard compiler flags. + + cxx_test(gmock-actions_test gmock_main) + cxx_test(gmock-cardinalities_test gmock_main) + cxx_test(gmock_ex_test gmock_main) + cxx_test(gmock-function-mocker_test gmock_main) + cxx_test(gmock-generated-actions_test gmock_main) + cxx_test(gmock-generated-matchers_test gmock_main) + cxx_test(gmock-internal-utils_test gmock_main) + cxx_test(gmock-matchers_test gmock_main) + cxx_test(gmock-more-actions_test gmock_main) + cxx_test(gmock-nice-strict_test gmock_main) + cxx_test(gmock-port_test gmock_main) + cxx_test(gmock-spec-builders_test gmock_main) + cxx_test(gmock_link_test gmock_main test/gmock_link2_test.cc) + cxx_test(gmock_test gmock_main) + + if (DEFINED GTEST_HAS_PTHREAD) + cxx_test(gmock_stress_test gmock) + endif() + + # gmock_all_test is commented to save time building and running tests. + # Uncomment if necessary. 
+ # cxx_test(gmock_all_test gmock_main) + + ############################################################ + # C++ tests built with non-standard compiler flags. + + if (MSVC) + cxx_library(gmock_main_no_exception "${cxx_no_exception}" + "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) + + cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" + "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) + + else() + cxx_library(gmock_main_no_exception "${cxx_no_exception}" src/gmock_main.cc) + target_link_libraries(gmock_main_no_exception PUBLIC gmock) + + cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" src/gmock_main.cc) + target_link_libraries(gmock_main_no_rtti PUBLIC gmock) + endif() + cxx_test_with_flags(gmock-more-actions_no_exception_test "${cxx_no_exception}" + gmock_main_no_exception test/gmock-more-actions_test.cc) + + cxx_test_with_flags(gmock_no_rtti_test "${cxx_no_rtti}" + gmock_main_no_rtti test/gmock-spec-builders_test.cc) + + cxx_shared_library(shared_gmock_main "${cxx_default}" + "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) + + # Tests that a binary can be built with Google Mock as a shared library. On + # some system configurations, it may not possible to run the binary without + # knowing more details about the system configurations. We do not try to run + # this binary. To get a more robust shared library coverage, configure with + # -DBUILD_SHARED_LIBS=ON. + cxx_executable_with_flags(shared_gmock_test_ "${cxx_default}" + shared_gmock_main test/gmock-spec-builders_test.cc) + set_target_properties(shared_gmock_test_ + PROPERTIES + COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1") + + ############################################################ + # Python tests. + + cxx_executable(gmock_leak_test_ test gmock_main) + py_test(gmock_leak_test) + + cxx_executable(gmock_output_test_ test gmock) + py_test(gmock_output_test) +endif() diff --git a/GraphBLAS/CUDA/test/googlemock/CONTRIBUTORS b/GraphBLAS/CUDA/test/googlemock/CONTRIBUTORS new file mode 100644 index 0000000000..6e9ae362b6 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/CONTRIBUTORS @@ -0,0 +1,40 @@ +# This file contains a list of people who've made non-trivial +# contribution to the Google C++ Mocking Framework project. People +# who commit code to the project are encouraged to add their names +# here. Please keep the list sorted by first names. + +Benoit Sigoure +Bogdan Piloca +Chandler Carruth +Dave MacLachlan +David Anderson +Dean Sturtevant +Gene Volovich +Hal Burch +Jeffrey Yasskin +Jim Keller +Joe Walnes +Jon Wray +Keir Mierle +Keith Ray +Kostya Serebryany +Lev Makhlis +Manuel Klimek +Mario Tanev +Mark Paskin +Markus Heule +Matthew Simmons +Mike Bland +Neal Norwitz +Nermin Ozkiranartli +Owen Carlsen +Paneendra Ba +Paul Menage +Piotr Kaminski +Russ Rufer +Sverre Sundsdal +Takeshi Yoshino +Vadim Berman +Vlad Losev +Wolfgang Klier +Zhanyong Wan diff --git a/GraphBLAS/CUDA/test/googlemock/LICENSE b/GraphBLAS/CUDA/test/googlemock/LICENSE new file mode 100644 index 0000000000..1941a11f8c --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/LICENSE @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/GraphBLAS/CUDA/test/googlemock/README.md b/GraphBLAS/CUDA/test/googlemock/README.md new file mode 100644 index 0000000000..183fdb81d9 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/README.md @@ -0,0 +1,44 @@ +# Googletest Mocking (gMock) Framework + +### Overview + +Google's framework for writing and using C++ mock classes. It can help you +derive better designs of your system and write better tests. + +It is inspired by: + +* [jMock](http://www.jmock.org/), +* [EasyMock](http://www.easymock.org/), and +* [Hamcrest](http://code.google.com/p/hamcrest/), + +and designed with C++'s specifics in mind. + +gMock: + +- provides a declarative syntax for defining mocks, +- can define partial (hybrid) mocks, which are a cross of real and mock + objects, +- handles functions of arbitrary types and overloaded functions, +- comes with a rich set of matchers for validating function arguments, +- uses an intuitive syntax for controlling the behavior of a mock, +- does automatic verification of expectations (no record-and-replay needed), +- allows arbitrary (partial) ordering constraints on function calls to be + expressed, +- lets a user extend it by defining new matchers and actions. +- does not use exceptions, and +- is easy to learn and use. + +Details and examples can be found here: + +* [gMock for Dummies](docs/for_dummies.md) +* [Legacy gMock FAQ](docs/gmock_faq.md) +* [gMock Cookbook](docs/cook_book.md) +* [gMock Cheat Sheet](docs/cheat_sheet.md) + +Please note that code under scripts/generator/ is from the [cppclean +project](http://code.google.com/p/cppclean/) and under the Apache +License, which is different from Google Mock's license. + +Google Mock is a part of +[Google Test C++ testing framework](http://github.com/google/googletest/) and a +subject to the same requirements. 
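+
+As a minimal taste of the declarative syntax listed above (a sketch only;
+`Greeter`, `MockGreeter`, and `GreeterTest` are illustrative names, not part
+of gMock):
+
+```cpp
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// A hypothetical interface, used only for this illustration.
+class Greeter {
+ public:
+  virtual ~Greeter() = default;
+  virtual std::string Greet(const std::string& name) = 0;
+};
+
+class MockGreeter : public Greeter {
+ public:
+  MOCK_METHOD(std::string, Greet, (const std::string& name), (override));
+};
+
+TEST(GreeterTest, GreetsByName) {
+  MockGreeter mock;
+  EXPECT_CALL(mock, Greet("World"))
+      .WillOnce(::testing::Return("Hello, World"));
+  EXPECT_EQ("Hello, World", mock.Greet("World"));
+}
+```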
diff --git a/GraphBLAS/CUDA/test/googlemock/cmake/gmock.pc.in b/GraphBLAS/CUDA/test/googlemock/cmake/gmock.pc.in new file mode 100644 index 0000000000..5780fcaa53 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/cmake/gmock.pc.in @@ -0,0 +1,10 @@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: gmock +Description: GoogleMock (without main() function) +Version: @PROJECT_VERSION@ +URL: https://github.com/google/googletest +Requires: gtest +Libs: -L${libdir} -lgmock @CMAKE_THREAD_LIBS_INIT@ +Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ diff --git a/GraphBLAS/CUDA/test/googlemock/cmake/gmock_main.pc.in b/GraphBLAS/CUDA/test/googlemock/cmake/gmock_main.pc.in new file mode 100644 index 0000000000..f2dfe69e0f --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/cmake/gmock_main.pc.in @@ -0,0 +1,10 @@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: gmock_main +Description: GoogleMock (with main() function) +Version: @PROJECT_VERSION@ +URL: https://github.com/google/googletest +Requires: gmock +Libs: -L${libdir} -lgmock_main @CMAKE_THREAD_LIBS_INIT@ +Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ diff --git a/GraphBLAS/CUDA/test/googlemock/docs/cheat_sheet.md b/GraphBLAS/CUDA/test/googlemock/docs/cheat_sheet.md new file mode 100644 index 0000000000..1e0541ba81 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/cheat_sheet.md @@ -0,0 +1,776 @@ +## gMock Cheat Sheet + + + + + +### Defining a Mock Class + +#### Mocking a Normal Class {#MockClass} + +Given + +```cpp +class Foo { + ... + virtual ~Foo(); + virtual int GetSize() const = 0; + virtual string Describe(const char* name) = 0; + virtual string Describe(int type) = 0; + virtual bool Process(Bar elem, int count) = 0; +}; +``` + +(note that `~Foo()` **must** be virtual) we can define its mock as + +```cpp +#include "gmock/gmock.h" + +class MockFoo : public Foo { + ... + MOCK_METHOD(int, GetSize, (), (const, override)); + MOCK_METHOD(string, Describe, (const char* name), (override)); + MOCK_METHOD(string, Describe, (int type), (override)); + MOCK_METHOD(bool, Process, (Bar elem, int count), (override)); +}; +``` + +To create a "nice" mock, which ignores all uninteresting calls, a "naggy" mock, +which warns on all uninteresting calls, or a "strict" mock, which treats them as +failures: + +```cpp +using ::testing::NiceMock; +using ::testing::NaggyMock; +using ::testing::StrictMock; + +NiceMock nice_foo; // The type is a subclass of MockFoo. +NaggyMock naggy_foo; // The type is a subclass of MockFoo. +StrictMock strict_foo; // The type is a subclass of MockFoo. +``` + +**Note:** A mock object is currently naggy by default. We may make it nice by +default in the future. + +#### Mocking a Class Template {#MockTemplate} + +Class templates can be mocked just like any class. + +To mock + +```cpp +template +class StackInterface { + ... + virtual ~StackInterface(); + virtual int GetSize() const = 0; + virtual void Push(const Elem& x) = 0; +}; +``` + +(note that all member functions that are mocked, including `~StackInterface()` +**must** be virtual). + +```cpp +template +class MockStack : public StackInterface { + ... + MOCK_METHOD(int, GetSize, (), (const, override)); + MOCK_METHOD(void, Push, (const Elem& x), (override)); +}; +``` + +#### Specifying Calling Conventions for Mock Functions + +If your mock function doesn't use the default calling convention, you can +specify it by adding `Calltype(convention)` to `MOCK_METHOD`'s 4th parameter. 
+For example, + +```cpp + MOCK_METHOD(bool, Foo, (int n), (Calltype(STDMETHODCALLTYPE))); + MOCK_METHOD(int, Bar, (double x, double y), + (const, Calltype(STDMETHODCALLTYPE))); +``` + +where `STDMETHODCALLTYPE` is defined by `` on Windows. + +### Using Mocks in Tests {#UsingMocks} + +The typical work flow is: + +1. Import the gMock names you need to use. All gMock symbols are in the + `testing` namespace unless they are macros or otherwise noted. +2. Create the mock objects. +3. Optionally, set the default actions of the mock objects. +4. Set your expectations on the mock objects (How will they be called? What + will they do?). +5. Exercise code that uses the mock objects; if necessary, check the result + using googletest assertions. +6. When a mock object is destructed, gMock automatically verifies that all + expectations on it have been satisfied. + +Here's an example: + +```cpp +using ::testing::Return; // #1 + +TEST(BarTest, DoesThis) { + MockFoo foo; // #2 + + ON_CALL(foo, GetSize()) // #3 + .WillByDefault(Return(1)); + // ... other default actions ... + + EXPECT_CALL(foo, Describe(5)) // #4 + .Times(3) + .WillRepeatedly(Return("Category 5")); + // ... other expectations ... + + EXPECT_EQ("good", MyProductionFunction(&foo)); // #5 +} // #6 +``` + +### Setting Default Actions {#OnCall} + +gMock has a **built-in default action** for any function that returns `void`, +`bool`, a numeric value, or a pointer. In C++11, it will additionally returns +the default-constructed value, if one exists for the given type. + +To customize the default action for functions with return type *`T`*: + +```cpp +using ::testing::DefaultValue; + +// Sets the default value to be returned. T must be CopyConstructible. +DefaultValue::Set(value); +// Sets a factory. Will be invoked on demand. T must be MoveConstructible. +// T MakeT(); +DefaultValue::SetFactory(&MakeT); +// ... use the mocks ... +// Resets the default value. +DefaultValue::Clear(); +``` + +Example usage: + +```cpp + // Sets the default action for return type std::unique_ptr to + // creating a new Buzz every time. + DefaultValue>::SetFactory( + [] { return MakeUnique(AccessLevel::kInternal); }); + + // When this fires, the default action of MakeBuzz() will run, which + // will return a new Buzz object. + EXPECT_CALL(mock_buzzer_, MakeBuzz("hello")).Times(AnyNumber()); + + auto buzz1 = mock_buzzer_.MakeBuzz("hello"); + auto buzz2 = mock_buzzer_.MakeBuzz("hello"); + EXPECT_NE(nullptr, buzz1); + EXPECT_NE(nullptr, buzz2); + EXPECT_NE(buzz1, buzz2); + + // Resets the default action for return type std::unique_ptr, + // to avoid interfere with other tests. + DefaultValue>::Clear(); +``` + +To customize the default action for a particular method of a specific mock +object, use `ON_CALL()`. `ON_CALL()` has a similar syntax to `EXPECT_CALL()`, +but it is used for setting default behaviors (when you do not require that the +mock method is called). See [here](cook_book.md#UseOnCall) for a more detailed +discussion. + +```cpp +ON_CALL(mock-object, method(matchers)) + .With(multi-argument-matcher) ? + .WillByDefault(action); +``` + +### Setting Expectations {#ExpectCall} + +`EXPECT_CALL()` sets **expectations** on a mock method (How will it be called? +What will it do?): + +```cpp +EXPECT_CALL(mock-object, method (matchers)?) + .With(multi-argument-matcher) ? + .Times(cardinality) ? + .InSequence(sequences) * + .After(expectations) * + .WillOnce(action) * + .WillRepeatedly(action) ? + .RetiresOnSaturation(); ? 
+``` + +For each item above, `?` means it can be used at most once, while `*` means it +can be used any number of times. + +In order to pass, `EXPECT_CALL` must be used before the calls are actually made. + +The `(matchers)` is a comma-separated list of matchers that correspond to each +of the arguments of `method`, and sets the expectation only for calls of +`method` that matches all of the matchers. + +If `(matchers)` is omitted, the expectation is the same as if the matchers were +set to anything matchers (for example, `(_, _, _, _)` for a four-arg method). + +If `Times()` is omitted, the cardinality is assumed to be: + +* `Times(1)` when there is neither `WillOnce()` nor `WillRepeatedly()`; +* `Times(n)` when there are `n` `WillOnce()`s but no `WillRepeatedly()`, where + `n` >= 1; or +* `Times(AtLeast(n))` when there are `n` `WillOnce()`s and a + `WillRepeatedly()`, where `n` >= 0. + +A method with no `EXPECT_CALL()` is free to be invoked *any number of times*, +and the default action will be taken each time. + +### Matchers {#MatcherList} + + + +A **matcher** matches a *single* argument. You can use it inside `ON_CALL()` or +`EXPECT_CALL()`, or use it to validate a value directly using two macros: + + +| Macro | Description | +| :----------------------------------- | :------------------------------------ | +| `EXPECT_THAT(actual_value, matcher)` | Asserts that `actual_value` matches `matcher`. | +| `ASSERT_THAT(actual_value, matcher)` | The same as `EXPECT_THAT(actual_value, matcher)`, except that it generates a **fatal** failure. | + + +Built-in matchers (where `argument` is the function argument, e.g. +`actual_value` in the example above, or when used in the context of +`EXPECT_CALL(mock_object, method(matchers))`, the arguments of `method`) are +divided into several categories: + +#### Wildcard + +Matcher | Description +:-------------------------- | :----------------------------------------------- +`_` | `argument` can be any value of the correct type. +`A()` or `An()` | `argument` can be any value of type `type`. + +#### Generic Comparison + + +| Matcher | Description | +| :--------------------- | :-------------------------------------------------- | +| `Eq(value)` or `value` | `argument == value` | +| `Ge(value)` | `argument >= value` | +| `Gt(value)` | `argument > value` | +| `Le(value)` | `argument <= value` | +| `Lt(value)` | `argument < value` | +| `Ne(value)` | `argument != value` | +| `IsFalse()` | `argument` evaluates to `false` in a Boolean context. | +| `IsTrue()` | `argument` evaluates to `true` in a Boolean context. | +| `IsNull()` | `argument` is a `NULL` pointer (raw or smart). | +| `NotNull()` | `argument` is a non-null pointer (raw or smart). | +| `Optional(m)` | `argument` is `optional<>` that contains a value matching `m`. (For testing whether an `optional<>` is set, check for equality with `nullopt`. You may need to use `Eq(nullopt)` if the inner type doesn't have `==`.)| +| `VariantWith(m)` | `argument` is `variant<>` that holds the alternative of type T with a value matching `m`. | +| `Ref(variable)` | `argument` is a reference to `variable`. | +| `TypedEq(value)` | `argument` has type `type` and is equal to `value`. You may need to use this instead of `Eq(value)` when the mock function is overloaded. | + + +Except `Ref()`, these matchers make a *copy* of `value` in case it's modified or +destructed later. If the compiler complains that `value` doesn't have a public +copy constructor, try wrap it in `ByRef()`, e.g. +`Eq(ByRef(non_copyable_value))`. 
If you do that, make sure `non_copyable_value` +is not changed afterwards, or the meaning of your matcher will be changed. + +`IsTrue` and `IsFalse` are useful when you need to use a matcher, or for types +that can be explicitly converted to Boolean, but are not implicitly converted to +Boolean. In other cases, you can use the basic +[`EXPECT_TRUE` and `EXPECT_FALSE`](../../googletest/docs/primer#basic-assertions) +assertions. + +#### Floating-Point Matchers {#FpMatchers} + + +| Matcher | Description | +| :------------------------------- | :--------------------------------- | +| `DoubleEq(a_double)` | `argument` is a `double` value approximately equal to `a_double`, treating two NaNs as unequal. | +| `FloatEq(a_float)` | `argument` is a `float` value approximately equal to `a_float`, treating two NaNs as unequal. | +| `NanSensitiveDoubleEq(a_double)` | `argument` is a `double` value approximately equal to `a_double`, treating two NaNs as equal. | +| `NanSensitiveFloatEq(a_float)` | `argument` is a `float` value approximately equal to `a_float`, treating two NaNs as equal. | +| `IsNan()` | `argument` is any floating-point type with a NaN value. | + + +The above matchers use ULP-based comparison (the same as used in googletest). +They automatically pick a reasonable error bound based on the absolute value of +the expected value. `DoubleEq()` and `FloatEq()` conform to the IEEE standard, +which requires comparing two NaNs for equality to return false. The +`NanSensitive*` version instead treats two NaNs as equal, which is often what a +user wants. + + +| Matcher | Description | +| :------------------------------------------------ | :----------------------- | +| `DoubleNear(a_double, max_abs_error)` | `argument` is a `double` value close to `a_double` (absolute error <= `max_abs_error`), treating two NaNs as unequal. | +| `FloatNear(a_float, max_abs_error)` | `argument` is a `float` value close to `a_float` (absolute error <= `max_abs_error`), treating two NaNs as unequal. | +| `NanSensitiveDoubleNear(a_double, max_abs_error)` | `argument` is a `double` value close to `a_double` (absolute error <= `max_abs_error`), treating two NaNs as equal. | +| `NanSensitiveFloatNear(a_float, max_abs_error)` | `argument` is a `float` value close to `a_float` (absolute error <= `max_abs_error`), treating two NaNs as equal. | + + +#### String Matchers + +The `argument` can be either a C string or a C++ string object: + + +| Matcher | Description | +| :---------------------- | :------------------------------------------------- | +| `ContainsRegex(string)` | `argument` matches the given regular expression. | +| `EndsWith(suffix)` | `argument` ends with string `suffix`. | +| `HasSubstr(string)` | `argument` contains `string` as a sub-string. | +| `MatchesRegex(string)` | `argument` matches the given regular expression with the match starting at the first character and ending at the last character. | +| `StartsWith(prefix)` | `argument` starts with string `prefix`. | +| `StrCaseEq(string)` | `argument` is equal to `string`, ignoring case. | +| `StrCaseNe(string)` | `argument` is not equal to `string`, ignoring case. | +| `StrEq(string)` | `argument` is equal to `string`. | +| `StrNe(string)` | `argument` is not equal to `string`. | + + +`ContainsRegex()` and `MatchesRegex()` take ownership of the `RE` object. They +use the regular expression syntax defined +[here](../../googletest/docs/advanced.md#regular-expression-syntax). 
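+
+For example, the string matchers can be used directly with `EXPECT_THAT`
+(`GetErrorMessage()` is a hypothetical function, used here only for
+illustration):
+
+```cpp
+using ::testing::HasSubstr;
+using ::testing::StartsWith;
+
+// Passes if the message starts with "ERROR" and mentions the config file.
+EXPECT_THAT(GetErrorMessage(), StartsWith("ERROR"));
+EXPECT_THAT(GetErrorMessage(), HasSubstr("config.yaml"));
+```
+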
All of +these matchers, except `ContainsRegex()` and `MatchesRegex()` work for wide +strings as well. + +#### Container Matchers + +Most STL-style containers support `==`, so you can use `Eq(expected_container)` +or simply `expected_container` to match a container exactly. If you want to +write the elements in-line, match them more flexibly, or get more informative +messages, you can use: + + +| Matcher | Description | +| :---------------------------------------- | :------------------------------- | +| `BeginEndDistanceIs(m)` | `argument` is a container whose `begin()` and `end()` iterators are separated by a number of increments matching `m`. E.g. `BeginEndDistanceIs(2)` or `BeginEndDistanceIs(Lt(2))`. For containers that define a `size()` method, `SizeIs(m)` may be more efficient. | +| `ContainerEq(container)` | The same as `Eq(container)` except that the failure message also includes which elements are in one container but not the other. | +| `Contains(e)` | `argument` contains an element that matches `e`, which can be either a value or a matcher. | +| `Each(e)` | `argument` is a container where *every* element matches `e`, which can be either a value or a matcher. | +| `ElementsAre(e0, e1, ..., en)` | `argument` has `n + 1` elements, where the *i*-th element matches `ei`, which can be a value or a matcher. | +| `ElementsAreArray({e0, e1, ..., en})`, `ElementsAreArray(a_container)`, `ElementsAreArray(begin, end)`, `ElementsAreArray(array)`, or `ElementsAreArray(array, count)` | The same as `ElementsAre()` except that the expected element values/matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `IsEmpty()` | `argument` is an empty container (`container.empty()`). | +| `IsSubsetOf({e0, e1, ..., en})`, `IsSubsetOf(a_container)`, `IsSubsetOf(begin, end)`, `IsSubsetOf(array)`, or `IsSubsetOf(array, count)` | `argument` matches `UnorderedElementsAre(x0, x1, ..., xk)` for some subset `{x0, x1, ..., xk}` of the expected matchers. | +| `IsSupersetOf({e0, e1, ..., en})`, `IsSupersetOf(a_container)`, `IsSupersetOf(begin, end)`, `IsSupersetOf(array)`, or `IsSupersetOf(array, count)` | Some subset of `argument` matches `UnorderedElementsAre(`expected matchers`)`. | +| `Pointwise(m, container)`, `Pointwise(m, {e0, e1, ..., en})` | `argument` contains the same number of elements as in `container`, and for all i, (the i-th element in `argument`, the i-th element in `container`) match `m`, which is a matcher on 2-tuples. E.g. `Pointwise(Le(), upper_bounds)` verifies that each element in `argument` doesn't exceed the corresponding element in `upper_bounds`. See more detail below. | +| `SizeIs(m)` | `argument` is a container whose size matches `m`. E.g. `SizeIs(2)` or `SizeIs(Lt(2))`. | +| `UnorderedElementsAre(e0, e1, ..., en)` | `argument` has `n + 1` elements, and under *some* permutation of the elements, each element matches an `ei` (for a different `i`), which can be a value or a matcher. | +| `UnorderedElementsAreArray({e0, e1, ..., en})`, `UnorderedElementsAreArray(a_container)`, `UnorderedElementsAreArray(begin, end)`, `UnorderedElementsAreArray(array)`, or `UnorderedElementsAreArray(array, count)` | The same as `UnorderedElementsAre()` except that the expected element values/matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `UnorderedPointwise(m, container)`, `UnorderedPointwise(m, {e0, e1, ..., en})` | Like `Pointwise(m, container)`, but ignores the order of elements. 
| +| `WhenSorted(m)` | When `argument` is sorted using the `<` operator, it matches container matcher `m`. E.g. `WhenSorted(ElementsAre(1, 2, 3))` verifies that `argument` contains elements 1, 2, and 3, ignoring order. | +| `WhenSortedBy(comparator, m)` | The same as `WhenSorted(m)`, except that the given comparator instead of `<` is used to sort `argument`. E.g. `WhenSortedBy(std::greater(), ElementsAre(3, 2, 1))`. | + + +**Notes:** + +* These matchers can also match: + 1. a native array passed by reference (e.g. in `Foo(const int (&a)[5])`), + and + 2. an array passed as a pointer and a count (e.g. in `Bar(const T* buffer, + int len)` -- see [Multi-argument Matchers](#MultiArgMatchers)). +* The array being matched may be multi-dimensional (i.e. its elements can be + arrays). +* `m` in `Pointwise(m, ...)` should be a matcher for `::std::tuple` + where `T` and `U` are the element type of the actual container and the + expected container, respectively. For example, to compare two `Foo` + containers where `Foo` doesn't support `operator==`, one might write: + + ```cpp + using ::std::get; + MATCHER(FooEq, "") { + return std::get<0>(arg).Equals(std::get<1>(arg)); + } + ... + EXPECT_THAT(actual_foos, Pointwise(FooEq(), expected_foos)); + ``` + +#### Member Matchers + + +| Matcher | Description | +| :------------------------------ | :----------------------------------------- | +| `Field(&class::field, m)` | `argument.field` (or `argument->field` when `argument` is a plain pointer) matches matcher `m`, where `argument` is an object of type _class_. | +| `Key(e)` | `argument.first` matches `e`, which can be either a value or a matcher. E.g. `Contains(Key(Le(5)))` can verify that a `map` contains a key `<= 5`. | +| `Pair(m1, m2)` | `argument` is an `std::pair` whose `first` field matches `m1` and `second` field matches `m2`. | +| `Property(&class::property, m)` | `argument.property()` (or `argument->property()` when `argument` is a plain pointer) matches matcher `m`, where `argument` is an object of type _class_. | + + +#### Matching the Result of a Function, Functor, or Callback + + +| Matcher | Description | +| :--------------- | :------------------------------------------------ | +| `ResultOf(f, m)` | `f(argument)` matches matcher `m`, where `f` is a function or functor. | + + +#### Pointer Matchers + + +| Matcher | Description | +| :------------------------ | :---------------------------------------------- | +| `Pointee(m)` | `argument` (either a smart pointer or a raw pointer) points to a value that matches matcher `m`. | +| `WhenDynamicCastTo(m)` | when `argument` is passed through `dynamic_cast()`, it matches matcher `m`. | + + + + + + +#### Multi-argument Matchers {#MultiArgMatchers} + +Technically, all matchers match a *single* value. A "multi-argument" matcher is +just one that matches a *tuple*. The following matchers can be used to match a +tuple `(x, y)`: + +Matcher | Description +:------ | :---------- +`Eq()` | `x == y` +`Ge()` | `x >= y` +`Gt()` | `x > y` +`Le()` | `x <= y` +`Lt()` | `x < y` +`Ne()` | `x != y` + +You can use the following selectors to pick a subset of the arguments (or +reorder them) to participate in the matching: + + +| Matcher | Description | +| :------------------------- | :---------------------------------------------- | +| `AllArgs(m)` | Equivalent to `m`. Useful as syntactic sugar in `.With(AllArgs(m))`. | +| `Args(m)` | The tuple of the `k` selected (using 0-based indices) arguments matches `m`, e.g. `Args<1, 2>(Eq())`. 
| + + +#### Composite Matchers + +You can make a matcher from one or more other matchers: + + +| Matcher | Description | +| :------------------------------- | :-------------------------------------- | +| `AllOf(m1, m2, ..., mn)` | `argument` matches all of the matchers `m1` to `mn`. | +| `AllOfArray({m0, m1, ..., mn})`, `AllOfArray(a_container)`, `AllOfArray(begin, end)`, `AllOfArray(array)`, or `AllOfArray(array, count)` | The same as `AllOf()` except that the matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `AnyOf(m1, m2, ..., mn)` | `argument` matches at least one of the matchers `m1` to `mn`. | +| `AnyOfArray({m0, m1, ..., mn})`, `AnyOfArray(a_container)`, `AnyOfArray(begin, end)`, `AnyOfArray(array)`, or `AnyOfArray(array, count)` | The same as `AnyOf()` except that the matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `Not(m)` | `argument` doesn't match matcher `m`. | + + + + +#### Adapters for Matchers + + +| Matcher | Description | +| :---------------------- | :------------------------------------ | +| `MatcherCast(m)` | casts matcher `m` to type `Matcher`. | +| `SafeMatcherCast(m)` | [safely casts](cook_book.md#casting-matchers) matcher `m` to type `Matcher`. | +| `Truly(predicate)` | `predicate(argument)` returns something considered by C++ to be true, where `predicate` is a function or functor. | + + +`AddressSatisfies(callback)` and `Truly(callback)` take ownership of `callback`, +which must be a permanent callback. + +#### Using Matchers as Predicates {#MatchersAsPredicatesCheat} + + +| Matcher | Description | +| :---------------------------- | :------------------------------------------ | +| `Matches(m)(value)` | evaluates to `true` if `value` matches `m`. You can use `Matches(m)` alone as a unary functor. | +| `ExplainMatchResult(m, value, result_listener)` | evaluates to `true` if `value` matches `m`, explaining the result to `result_listener`. | +| `Value(value, m)` | evaluates to `true` if `value` matches `m`. | + + +#### Defining Matchers + + +| Matcher | Description | +| :----------------------------------- | :------------------------------------ | +| `MATCHER(IsEven, "") { return (arg % 2) == 0; }` | Defines a matcher `IsEven()` to match an even number. | +| `MATCHER_P(IsDivisibleBy, n, "") { *result_listener << "where the remainder is " << (arg % n); return (arg % n) == 0; }` | Defines a matcher `IsDivisibleBy(n)` to match a number divisible by `n`. | +| `MATCHER_P2(IsBetween, a, b, std::string(negation ? "isn't" : "is") + " between " + PrintToString(a) + " and " + PrintToString(b)) { return a <= arg && arg <= b; }` | Defines a matcher `IsBetween(a, b)` to match a value in the range [`a`, `b`]. | + + +**Notes:** + +1. The `MATCHER*` macros cannot be used inside a function or class. +2. The matcher body must be *purely functional* (i.e. it cannot have any side + effect, and the result must not depend on anything other than the value + being matched and the matcher parameters). +3. You can use `PrintToString(x)` to convert a value `x` of any type to a + string. + +### Actions {#ActionList} + +**Actions** specify what a mock function should do when invoked. + +#### Returning a Value + + +| | | +| :-------------------------------- | :-------------------------------------------- | +| `Return()` | Return from a `void` mock function. | +| `Return(value)` | Return `value`. 
If the type of `value` is different to the mock function's return type, `value` is converted to the latter type at the time the expectation is set, not when the action is executed. | +| `ReturnArg()` | Return the `N`-th (0-based) argument. | +| `ReturnNew(a1, ..., ak)` | Return `new T(a1, ..., ak)`; a different object is created each time. | +| `ReturnNull()` | Return a null pointer. | +| `ReturnPointee(ptr)` | Return the value pointed to by `ptr`. | +| `ReturnRef(variable)` | Return a reference to `variable`. | +| `ReturnRefOfCopy(value)` | Return a reference to a copy of `value`; the copy lives as long as the action. | +| `ReturnRoundRobin({a1, ..., ak})` | Each call will return the next `ai` in the list, starting at the beginning when the end of the list is reached. | + + +#### Side Effects + + +| | | +| :--------------------------------- | :-------------------------------------- | +| `Assign(&variable, value)` | Assign `value` to variable. | +| `DeleteArg()` | Delete the `N`-th (0-based) argument, which must be a pointer. | +| `SaveArg(pointer)` | Save the `N`-th (0-based) argument to `*pointer`. | +| `SaveArgPointee(pointer)` | Save the value pointed to by the `N`-th (0-based) argument to `*pointer`. | +| `SetArgReferee(value)` | Assign value to the variable referenced by the `N`-th (0-based) argument. | +| `SetArgPointee(value)` | Assign `value` to the variable pointed by the `N`-th (0-based) argument. | +| `SetArgumentPointee(value)` | Same as `SetArgPointee(value)`. Deprecated. Will be removed in v1.7.0. | +| `SetArrayArgument(first, last)` | Copies the elements in source range [`first`, `last`) to the array pointed to by the `N`-th (0-based) argument, which can be either a pointer or an iterator. The action does not take ownership of the elements in the source range. | +| `SetErrnoAndReturn(error, value)` | Set `errno` to `error` and return `value`. | +| `Throw(exception)` | Throws the given exception, which can be any copyable value. Available since v1.1.0. | + + +#### Using a Function, Functor, or Lambda as an Action + +In the following, by "callable" we mean a free function, `std::function`, +functor, or lambda. + + +| | | +| :---------------------------------- | :------------------------------------- | +| `f` | Invoke f with the arguments passed to the mock function, where f is a callable. | +| `Invoke(f)` | Invoke `f` with the arguments passed to the mock function, where `f` can be a global/static function or a functor. | +| `Invoke(object_pointer, &class::method)` | Invoke the method on the object with the arguments passed to the mock function. | +| `InvokeWithoutArgs(f)` | Invoke `f`, which can be a global/static function or a functor. `f` must take no arguments. | +| `InvokeWithoutArgs(object_pointer, &class::method)` | Invoke the method on the object, which takes no arguments. | +| `InvokeArgument(arg1, arg2, ..., argk)` | Invoke the mock function's `N`-th (0-based) argument, which must be a function or a functor, with the `k` arguments. | + + +The return value of the invoked function is used as the return value of the +action. + +When defining a callable to be used with `Invoke*()`, you can declare any unused +parameters as `Unused`: + +```cpp +using ::testing::Invoke; +double Distance(Unused, double x, double y) { return sqrt(x*x + y*y); } +... +EXPECT_CALL(mock, Foo("Hi", _, _)).WillOnce(Invoke(Distance)); +``` + +`Invoke(callback)` and `InvokeWithoutArgs(callback)` take ownership of +`callback`, which must be permanent. 
The type of `callback` must be a base +callback type instead of a derived one, e.g. + +```cpp + BlockingClosure* done = new BlockingClosure; + ... Invoke(done) ...; // This won't compile! + + Closure* done2 = new BlockingClosure; + ... Invoke(done2) ...; // This works. +``` + +In `InvokeArgument(...)`, if an argument needs to be passed by reference, +wrap it inside `ByRef()`. For example, + +```cpp +using ::testing::ByRef; +using ::testing::InvokeArgument; +... +InvokeArgument<2>(5, string("Hi"), ByRef(foo)) +``` + +calls the mock function's #2 argument, passing to it `5` and `string("Hi")` by +value, and `foo` by reference. + +#### Default Action + + +| Matcher | Description | +| :------------ | :----------------------------------------------------- | +| `DoDefault()` | Do the default action (specified by `ON_CALL()` or the built-in one). | + + +**Note:** due to technical reasons, `DoDefault()` cannot be used inside a +composite action - trying to do so will result in a run-time error. + + + +#### Composite Actions + + +| | | +| :----------------------------- | :------------------------------------------ | +| `DoAll(a1, a2, ..., an)` | Do all actions `a1` to `an` and return the result of `an` in each invocation. The first `n - 1` sub-actions must return void. | +| `IgnoreResult(a)` | Perform action `a` and ignore its result. `a` must not return void. | +| `WithArg(a)` | Pass the `N`-th (0-based) argument of the mock function to action `a` and perform it. | +| `WithArgs(a)` | Pass the selected (0-based) arguments of the mock function to action `a` and perform it. | +| `WithoutArgs(a)` | Perform action `a` without any arguments. | + + +#### Defining Actions + + +| | | +| :--------------------------------- | :-------------------------------------- | +| `ACTION(Sum) { return arg0 + arg1; }` | Defines an action `Sum()` to return the sum of the mock function's argument #0 and #1. | +| `ACTION_P(Plus, n) { return arg0 + n; }` | Defines an action `Plus(n)` to return the sum of the mock function's argument #0 and `n`. | +| `ACTION_Pk(Foo, p1, ..., pk) { statements; }` | Defines a parameterized action `Foo(p1, ..., pk)` to execute the given `statements`. | + + +The `ACTION*` macros cannot be used inside a function or class. + +### Cardinalities {#CardinalityList} + +These are used in `Times()` to specify how many times a mock function will be +called: + + +| | | +| :---------------- | :----------------------------------------------------- | +| `AnyNumber()` | The function can be called any number of times. | +| `AtLeast(n)` | The call is expected at least `n` times. | +| `AtMost(n)` | The call is expected at most `n` times. | +| `Between(m, n)` | The call is expected between `m` and `n` (inclusive) times. | +| `Exactly(n) or n` | The call is expected exactly `n` times. In particular, the call should never happen when `n` is 0. | + + +### Expectation Order + +By default, the expectations can be matched in *any* order. If some or all +expectations must be matched in a given order, there are two ways to specify it. +They can be used either independently or together. + +#### The After Clause {#AfterClause} + +```cpp +using ::testing::Expectation; +... +Expectation init_x = EXPECT_CALL(foo, InitX()); +Expectation init_y = EXPECT_CALL(foo, InitY()); +EXPECT_CALL(foo, Bar()) + .After(init_x, init_y); +``` + +says that `Bar()` can be called only after both `InitX()` and `InitY()` have +been called. 
+ +If you don't know how many pre-requisites an expectation has when you write it, +you can use an `ExpectationSet` to collect them: + +```cpp +using ::testing::ExpectationSet; +... +ExpectationSet all_inits; +for (int i = 0; i < element_count; i++) { + all_inits += EXPECT_CALL(foo, InitElement(i)); +} +EXPECT_CALL(foo, Bar()) + .After(all_inits); +``` + +says that `Bar()` can be called only after all elements have been initialized +(but we don't care about which elements get initialized before the others). + +Modifying an `ExpectationSet` after using it in an `.After()` doesn't affect the +meaning of the `.After()`. + +#### Sequences {#UsingSequences} + +When you have a long chain of sequential expectations, it's easier to specify +the order using **sequences**, which don't require you to given each expectation +in the chain a different name. *All expected calls* in the same sequence must +occur in the order they are specified. + +```cpp +using ::testing::Return; +using ::testing::Sequence; +Sequence s1, s2; +... +EXPECT_CALL(foo, Reset()) + .InSequence(s1, s2) + .WillOnce(Return(true)); +EXPECT_CALL(foo, GetSize()) + .InSequence(s1) + .WillOnce(Return(1)); +EXPECT_CALL(foo, Describe(A())) + .InSequence(s2) + .WillOnce(Return("dummy")); +``` + +says that `Reset()` must be called before *both* `GetSize()` *and* `Describe()`, +and the latter two can occur in any order. + +To put many expectations in a sequence conveniently: + +```cpp +using ::testing::InSequence; +{ + InSequence seq; + + EXPECT_CALL(...)...; + EXPECT_CALL(...)...; + ... + EXPECT_CALL(...)...; +} +``` + +says that all expected calls in the scope of `seq` must occur in strict order. +The name `seq` is irrelevant. + +### Verifying and Resetting a Mock + +gMock will verify the expectations on a mock object when it is destructed, or +you can do it earlier: + +```cpp +using ::testing::Mock; +... +// Verifies and removes the expectations on mock_obj; +// returns true if and only if successful. +Mock::VerifyAndClearExpectations(&mock_obj); +... +// Verifies and removes the expectations on mock_obj; +// also removes the default actions set by ON_CALL(); +// returns true if and only if successful. +Mock::VerifyAndClear(&mock_obj); +``` + +You can also tell gMock that a mock object can be leaked and doesn't need to be +verified: + +```cpp +Mock::AllowLeak(&mock_obj); +``` + +### Mock Classes + +gMock defines a convenient mock class template + +```cpp +class MockFunction { + public: + MOCK_METHOD(R, Call, (A1, ..., An)); +}; +``` + +See this [recipe](cook_book.md#using-check-points) for one application of it. + +### Flags + + +| Flag | Description | +| :----------------------------- | :---------------------------------------- | +| `--gmock_catch_leaked_mocks=0` | Don't report leaked mock objects as failures. | +| `--gmock_verbose=LEVEL` | Sets the default verbosity level (`info`, `warning`, or `error`) of Google Mock messages. | + diff --git a/GraphBLAS/CUDA/test/googlemock/docs/cook_book.md b/GraphBLAS/CUDA/test/googlemock/docs/cook_book.md new file mode 100644 index 0000000000..51eb94a9ad --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/cook_book.md @@ -0,0 +1,4271 @@ +# gMock Cookbook + + + +You can find recipes for using gMock here. If you haven't yet, please read +[this](for_dummies.md) first to make sure you understand the basics. + +**Note:** gMock lives in the `testing` name space. For readability, it is +recommended to write `using ::testing::Foo;` once in your file before using the +name `Foo` defined by gMock. 
We omit such `using` statements in this section for +brevity, but you should do it in your own code. + +## Creating Mock Classes + +Mock classes are defined as normal classes, using the `MOCK_METHOD` macro to +generate mocked methods. The macro gets 3 or 4 parameters: + +```cpp +class MyMock { + public: + MOCK_METHOD(ReturnType, MethodName, (Args...)); + MOCK_METHOD(ReturnType, MethodName, (Args...), (Specs...)); +}; +``` + +The first 3 parameters are simply the method declaration, split into 3 parts. +The 4th parameter accepts a closed list of qualifiers, which affect the +generated method: + +* **`const`** - Makes the mocked method a `const` method. Required if + overriding a `const` method. +* **`override`** - Marks the method with `override`. Recommended if overriding + a `virtual` method. +* **`noexcept`** - Marks the method with `noexcept`. Required if overriding a + `noexcept` method. +* **`Calltype(...)`** - Sets the call type for the method (e.g. to + `STDMETHODCALLTYPE`), useful in Windows. + +### Dealing with unprotected commas + +Unprotected commas, i.e. commas which are not surrounded by parentheses, prevent +`MOCK_METHOD` from parsing its arguments correctly: + +```cpp {.bad} +class MockFoo { + public: + MOCK_METHOD(std::pair, GetPair, ()); // Won't compile! + MOCK_METHOD(bool, CheckMap, (std::map, bool)); // Won't compile! +}; +``` + +Solution 1 - wrap with parentheses: + +```cpp {.good} +class MockFoo { + public: + MOCK_METHOD((std::pair), GetPair, ()); + MOCK_METHOD(bool, CheckMap, ((std::map), bool)); +}; +``` + +Note that wrapping a return or argument type with parentheses is, in general, +invalid C++. `MOCK_METHOD` removes the parentheses. + +Solution 2 - define an alias: + +```cpp {.good} +class MockFoo { + public: + using BoolAndInt = std::pair; + MOCK_METHOD(BoolAndInt, GetPair, ()); + using MapIntDouble = std::map; + MOCK_METHOD(bool, CheckMap, (MapIntDouble, bool)); +}; +``` + +### Mocking Private or Protected Methods + +You must always put a mock method definition (`MOCK_METHOD`) in a `public:` +section of the mock class, regardless of the method being mocked being `public`, +`protected`, or `private` in the base class. This allows `ON_CALL` and +`EXPECT_CALL` to reference the mock function from outside of the mock class. +(Yes, C++ allows a subclass to change the access level of a virtual function in +the base class.) Example: + +```cpp +class Foo { + public: + ... + virtual bool Transform(Gadget* g) = 0; + + protected: + virtual void Resume(); + + private: + virtual int GetTimeOut(); +}; + +class MockFoo : public Foo { + public: + ... + MOCK_METHOD(bool, Transform, (Gadget* g), (override)); + + // The following must be in the public section, even though the + // methods are protected or private in the base class. + MOCK_METHOD(void, Resume, (), (override)); + MOCK_METHOD(int, GetTimeOut, (), (override)); +}; +``` + +### Mocking Overloaded Methods + +You can mock overloaded functions as usual. No special attention is required: + +```cpp +class Foo { + ... + + // Must be virtual as we'll inherit from Foo. + virtual ~Foo(); + + // Overloaded on the types and/or numbers of arguments. + virtual int Add(Element x); + virtual int Add(int times, Element x); + + // Overloaded on the const-ness of this object. + virtual Bar& GetBar(); + virtual const Bar& GetBar() const; +}; + +class MockFoo : public Foo { + ... 
+ MOCK_METHOD(int, Add, (Element x), (override)); + MOCK_METHOD(int, Add, (int times, Element x), (override)); + + MOCK_METHOD(Bar&, GetBar, (), (override)); + MOCK_METHOD(const Bar&, GetBar, (), (const, override)); +}; +``` + +**Note:** if you don't mock all versions of the overloaded method, the compiler +will give you a warning about some methods in the base class being hidden. To +fix that, use `using` to bring them in scope: + +```cpp +class MockFoo : public Foo { + ... + using Foo::Add; + MOCK_METHOD(int, Add, (Element x), (override)); + // We don't want to mock int Add(int times, Element x); + ... +}; +``` + +### Mocking Class Templates + +You can mock class templates just like any class. + +```cpp +template +class StackInterface { + ... + // Must be virtual as we'll inherit from StackInterface. + virtual ~StackInterface(); + + virtual int GetSize() const = 0; + virtual void Push(const Elem& x) = 0; +}; + +template +class MockStack : public StackInterface { + ... + MOCK_METHOD(int, GetSize, (), (override)); + MOCK_METHOD(void, Push, (const Elem& x), (override)); +}; +``` + +### Mocking Non-virtual Methods {#MockingNonVirtualMethods} + +gMock can mock non-virtual functions to be used in Hi-perf dependency +injection. + +In this case, instead of sharing a common base class with the real class, your +mock class will be *unrelated* to the real class, but contain methods with the +same signatures. The syntax for mocking non-virtual methods is the *same* as +mocking virtual methods (just don't add `override`): + +```cpp +// A simple packet stream class. None of its members is virtual. +class ConcretePacketStream { + public: + void AppendPacket(Packet* new_packet); + const Packet* GetPacket(size_t packet_number) const; + size_t NumberOfPackets() const; + ... +}; + +// A mock packet stream class. It inherits from no other, but defines +// GetPacket() and NumberOfPackets(). +class MockPacketStream { + public: + MOCK_METHOD(const Packet*, GetPacket, (size_t packet_number), (const)); + MOCK_METHOD(size_t, NumberOfPackets, (), (const)); + ... +}; +``` + +Note that the mock class doesn't define `AppendPacket()`, unlike the real class. +That's fine as long as the test doesn't need to call it. + +Next, you need a way to say that you want to use `ConcretePacketStream` in +production code, and use `MockPacketStream` in tests. Since the functions are +not virtual and the two classes are unrelated, you must specify your choice at +*compile time* (as opposed to run time). + +One way to do it is to templatize your code that needs to use a packet stream. +More specifically, you will give your code a template type argument for the type +of the packet stream. In production, you will instantiate your template with +`ConcretePacketStream` as the type argument. In tests, you will instantiate the +same template with `MockPacketStream`. For example, you may write: + +```cpp +template +void CreateConnection(PacketStream* stream) { ... } + +template +class PacketReader { + public: + void ReadPackets(PacketStream* stream, size_t packet_num); +}; +``` + +Then you can use `CreateConnection()` and +`PacketReader` in production code, and use +`CreateConnection()` and `PacketReader` in +tests. + +```cpp + MockPacketStream mock_stream; + EXPECT_CALL(mock_stream, ...)...; + .. set more expectations on mock_stream ... + PacketReader reader(&mock_stream); + ... exercise reader ... +``` + +### Mocking Free Functions + +It's possible to use gMock to mock a free function (i.e. a C-style function or a +static method). 
You just need to rewrite your code to use an interface (abstract +class). + +Instead of calling a free function (say, `OpenFile`) directly, introduce an +interface for it and have a concrete subclass that calls the free function: + +```cpp +class FileInterface { + public: + ... + virtual bool Open(const char* path, const char* mode) = 0; +}; + +class File : public FileInterface { + public: + ... + virtual bool Open(const char* path, const char* mode) { + return OpenFile(path, mode); + } +}; +``` + +Your code should talk to `FileInterface` to open a file. Now it's easy to mock +out the function. + +This may seem like a lot of hassle, but in practice you often have multiple +related functions that you can put in the same interface, so the per-function +syntactic overhead will be much lower. + +If you are concerned about the performance overhead incurred by virtual +functions, and profiling confirms your concern, you can combine this with the +recipe for [mocking non-virtual methods](#MockingNonVirtualMethods). + +### Old-Style `MOCK_METHODn` Macros + +Before the generic `MOCK_METHOD` macro was introduced, mocks where created using +a family of macros collectively called `MOCK_METHODn`. These macros are still +supported, though migration to the new `MOCK_METHOD` is recommended. + +The macros in the `MOCK_METHODn` family differ from `MOCK_METHOD`: + +* The general structure is `MOCK_METHODn(MethodName, ReturnType(Args))`, + instead of `MOCK_METHOD(ReturnType, MethodName, (Args))`. +* The number `n` must equal the number of arguments. +* When mocking a const method, one must use `MOCK_CONST_METHODn`. +* When mocking a class template, the macro name must be suffixed with `_T`. +* In order to specify the call type, the macro name must be suffixed with + `_WITH_CALLTYPE`, and the call type is the first macro argument. + +Old macros and their new equivalents: + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Method type | Old | New |
+| :----------------------------------------------- | :---------------------------------------------------------------------- | :----------------------------------------------------------------------- |
+| Simple | `MOCK_METHOD1(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int))` |
+| Const Method | `MOCK_CONST_METHOD1(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const))` |
+| Method in a Class Template | `MOCK_METHOD1_T(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int))` |
+| Const Method in a Class Template | `MOCK_CONST_METHOD1_T(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const))` |
+| Method with Call Type | `MOCK_METHOD1_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (Calltype(STDMETHODCALLTYPE)))` |
+| Const Method with Call Type | `MOCK_CONST_METHOD1_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const, Calltype(STDMETHODCALLTYPE)))` |
+| Method with Call Type in a Class Template | `MOCK_METHOD1_T_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (Calltype(STDMETHODCALLTYPE)))` |
+| Const Method with Call Type in a Class Template | `MOCK_CONST_METHOD1_T_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const, Calltype(STDMETHODCALLTYPE)))` |
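+
+As an illustration of the mapping above, here is the same interface mocked
+once with the old-style macros and once with the generic `MOCK_METHOD` macro.
+This is a sketch only; `Adder`, `OldMockAdder`, and `NewMockAdder` are made-up
+names, not part of gMock:
+
+```cpp
+#include "gmock/gmock.h"
+
+// A hypothetical interface, used only for this example.
+class Adder {
+ public:
+  virtual ~Adder() = default;
+  virtual bool Add(int x) = 0;
+  virtual int GetSize() const = 0;
+};
+
+// Old style: the macro name encodes the argument count and const-ness.
+class OldMockAdder : public Adder {
+ public:
+  MOCK_METHOD1(Add, bool(int));        // mocks: bool Add(int x)
+  MOCK_CONST_METHOD0(GetSize, int());  // mocks: int GetSize() const
+};
+
+// New style: a single MOCK_METHOD; qualifiers go in the 4th parameter.
+class NewMockAdder : public Adder {
+ public:
+  MOCK_METHOD(bool, Add, (int x), (override));
+  MOCK_METHOD(int, GetSize, (), (const, override));
+};
+```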
+ +### The Nice, the Strict, and the Naggy {#NiceStrictNaggy} + +If a mock method has no `EXPECT_CALL` spec but is called, we say that it's an +"uninteresting call", and the default action (which can be specified using +`ON_CALL()`) of the method will be taken. Currently, an uninteresting call will +also by default cause gMock to print a warning. (In the future, we might remove +this warning by default.) + +However, sometimes you may want to ignore these uninteresting calls, and +sometimes you may want to treat them as errors. gMock lets you make the decision +on a per-mock-object basis. + +Suppose your test uses a mock class `MockFoo`: + +```cpp +TEST(...) { + MockFoo mock_foo; + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... +} +``` + +If a method of `mock_foo` other than `DoThis()` is called, you will get a +warning. However, if you rewrite your test to use `NiceMock` instead, +you can suppress the warning: + +```cpp +using ::testing::NiceMock; + +TEST(...) { + NiceMock mock_foo; + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... +} +``` + +`NiceMock` is a subclass of `MockFoo`, so it can be used wherever +`MockFoo` is accepted. + +It also works if `MockFoo`'s constructor takes some arguments, as +`NiceMock` "inherits" `MockFoo`'s constructors: + +```cpp +using ::testing::NiceMock; + +TEST(...) { + NiceMock mock_foo(5, "hi"); // Calls MockFoo(5, "hi"). + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... +} +``` + +The usage of `StrictMock` is similar, except that it makes all uninteresting +calls failures: + +```cpp +using ::testing::StrictMock; + +TEST(...) { + StrictMock mock_foo; + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... + + // The test will fail if a method of mock_foo other than DoThis() + // is called. +} +``` + +NOTE: `NiceMock` and `StrictMock` only affects *uninteresting* calls (calls of +*methods* with no expectations); they do not affect *unexpected* calls (calls of +methods with expectations, but they don't match). See +[Understanding Uninteresting vs Unexpected Calls](#uninteresting-vs-unexpected). + +There are some caveats though (I dislike them just as much as the next guy, but +sadly they are side effects of C++'s limitations): + +1. `NiceMock` and `StrictMock` only work for mock methods + defined using the `MOCK_METHOD` macro **directly** in the `MockFoo` class. + If a mock method is defined in a **base class** of `MockFoo`, the "nice" or + "strict" modifier may not affect it, depending on the compiler. In + particular, nesting `NiceMock` and `StrictMock` (e.g. + `NiceMock >`) is **not** supported. +2. `NiceMock` and `StrictMock` may not work correctly if the + destructor of `MockFoo` is not virtual. We would like to fix this, but it + requires cleaning up existing tests. +3. During the constructor or destructor of `MockFoo`, the mock object is *not* + nice or strict. This may cause surprises if the constructor or destructor + calls a mock method on `this` object. (This behavior, however, is consistent + with C++'s general rule: if a constructor or destructor calls a virtual + method of `this` object, that method is treated as non-virtual. In other + words, to the base class's constructor or destructor, `this` object behaves + like an instance of the base class, not the derived class. This rule is + required for safety. 
Otherwise a base constructor may use members of a + derived class before they are initialized, or a base destructor may use + members of a derived class after they have been destroyed.) + +Finally, you should be **very cautious** about when to use naggy or strict +mocks, as they tend to make tests more brittle and harder to maintain. When you +refactor your code without changing its externally visible behavior, ideally you +shouldn't need to update any tests. If your code interacts with a naggy mock, +however, you may start to get spammed with warnings as the result of your +change. Worse, if your code interacts with a strict mock, your tests may start +to fail and you'll be forced to fix them. Our general recommendation is to use +nice mocks (not yet the default) most of the time, use naggy mocks (the current +default) when developing or debugging tests, and use strict mocks only as the +last resort. + +### Simplifying the Interface without Breaking Existing Code {#SimplerInterfaces} + +Sometimes a method has a long list of arguments that is mostly uninteresting. +For example: + +```cpp +class LogSink { + public: + ... + virtual void send(LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct tm* tm_time, + const char* message, size_t message_len) = 0; +}; +``` + +This method's argument list is lengthy and hard to work with (the `message` +argument is not even 0-terminated). If we mock it as is, using the mock will be +awkward. If, however, we try to simplify this interface, we'll need to fix all +clients depending on it, which is often infeasible. + +The trick is to redispatch the method in the mock class: + +```cpp +class ScopedMockLog : public LogSink { + public: + ... + virtual void send(LogSeverity severity, const char* full_filename, + const char* base_filename, int line, const tm* tm_time, + const char* message, size_t message_len) { + // We are only interested in the log severity, full file name, and + // log message. + Log(severity, full_filename, std::string(message, message_len)); + } + + // Implements the mock method: + // + // void Log(LogSeverity severity, + // const string& file_path, + // const string& message); + MOCK_METHOD(void, Log, + (LogSeverity severity, const string& file_path, + const string& message)); +}; +``` + +By defining a new mock method with a trimmed argument list, we make the mock +class more user-friendly. + +This technique may also be applied to make overloaded methods more amenable to +mocking. For example, when overloads have been used to implement default +arguments: + +```cpp +class MockTurtleFactory : public TurtleFactory { + public: + Turtle* MakeTurtle(int length, int weight) override { ... } + Turtle* MakeTurtle(int length, int weight, int speed) override { ... } + + // the above methods delegate to this one: + MOCK_METHOD(Turtle*, DoMakeTurtle, ()); +}; +``` + +This allows tests that don't care which overload was invoked to avoid specifying +argument matchers: + +```cpp +ON_CALL(factory, DoMakeTurtle) + .WillByDefault(MakeMockTurtle()); +``` + +### Alternative to Mocking Concrete Classes + +Often you may find yourself using classes that don't implement interfaces. In +order to test your code that uses such a class (let's call it `Concrete`), you +may be tempted to make the methods of `Concrete` virtual and then mock it. + +Try not to do that. + +Making a non-virtual function virtual is a big decision. It creates an extension +point where subclasses can tweak your class' behavior. 
This weakens your control +on the class because now it's harder to maintain the class invariants. You +should make a function virtual only when there is a valid reason for a subclass +to override it. + +Mocking concrete classes directly is problematic as it creates a tight coupling +between the class and the tests - any small change in the class may invalidate +your tests and make test maintenance a pain. + +To avoid such problems, many programmers have been practicing "coding to +interfaces": instead of talking to the `Concrete` class, your code would define +an interface and talk to it. Then you implement that interface as an adaptor on +top of `Concrete`. In tests, you can easily mock that interface to observe how +your code is doing. + +This technique incurs some overhead: + +* You pay the cost of virtual function calls (usually not a problem). +* There is more abstraction for the programmers to learn. + +However, it can also bring significant benefits in addition to better +testability: + +* `Concrete`'s API may not fit your problem domain very well, as you may not + be the only client it tries to serve. By designing your own interface, you + have a chance to tailor it to your need - you may add higher-level + functionalities, rename stuff, etc instead of just trimming the class. This + allows you to write your code (user of the interface) in a more natural way, + which means it will be more readable, more maintainable, and you'll be more + productive. +* If `Concrete`'s implementation ever has to change, you don't have to rewrite + everywhere it is used. Instead, you can absorb the change in your + implementation of the interface, and your other code and tests will be + insulated from this change. + +Some people worry that if everyone is practicing this technique, they will end +up writing lots of redundant code. This concern is totally understandable. +However, there are two reasons why it may not be the case: + +* Different projects may need to use `Concrete` in different ways, so the best + interfaces for them will be different. Therefore, each of them will have its + own domain-specific interface on top of `Concrete`, and they will not be the + same code. +* If enough projects want to use the same interface, they can always share it, + just like they have been sharing `Concrete`. You can check in the interface + and the adaptor somewhere near `Concrete` (perhaps in a `contrib` + sub-directory) and let many projects use it. + +You need to weigh the pros and cons carefully for your particular problem, but +I'd like to assure you that the Java community has been practicing this for a +long time and it's a proven effective technique applicable in a wide variety of +situations. :-) + +### Delegating Calls to a Fake {#DelegatingToFake} + +Some times you have a non-trivial fake implementation of an interface. For +example: + +```cpp +class Foo { + public: + virtual ~Foo() {} + virtual char DoThis(int n) = 0; + virtual void DoThat(const char* s, int* p) = 0; +}; + +class FakeFoo : public Foo { + public: + char DoThis(int n) override { + return (n > 0) ? '+' : + (n < 0) ? '-' : '0'; + } + + void DoThat(const char* s, int* p) override { + *p = strlen(s); + } +}; +``` + +Now you want to mock this interface such that you can set expectations on it. +However, you also want to use `FakeFoo` for the default behavior, as duplicating +it in the mock object is, well, a lot of work. 
+ +When you define the mock class using gMock, you can have it delegate its default +action to a fake class you already have, using this pattern: + +```cpp +class MockFoo : public Foo { + public: + // Normal mock method definitions using gMock. + MOCK_METHOD(char, DoThis, (int n), (override)); + MOCK_METHOD(void, DoThat, (const char* s, int* p), (override)); + + // Delegates the default actions of the methods to a FakeFoo object. + // This must be called *before* the custom ON_CALL() statements. + void DelegateToFake() { + ON_CALL(*this, DoThis).WillByDefault([this](int n) { + return fake_.DoThis(n); + }); + ON_CALL(*this, DoThat).WillByDefault([this](const char* s, int* p) { + fake_.DoThat(s, p); + }); + } + + private: + FakeFoo fake_; // Keeps an instance of the fake in the mock. +}; +``` + +With that, you can use `MockFoo` in your tests as usual. Just remember that if +you don't explicitly set an action in an `ON_CALL()` or `EXPECT_CALL()`, the +fake will be called upon to do it.: + +```cpp +using ::testing::_; + +TEST(AbcTest, Xyz) { + MockFoo foo; + + foo.DelegateToFake(); // Enables the fake for delegation. + + // Put your ON_CALL(foo, ...)s here, if any. + + // No action specified, meaning to use the default action. + EXPECT_CALL(foo, DoThis(5)); + EXPECT_CALL(foo, DoThat(_, _)); + + int n = 0; + EXPECT_EQ('+', foo.DoThis(5)); // FakeFoo::DoThis() is invoked. + foo.DoThat("Hi", &n); // FakeFoo::DoThat() is invoked. + EXPECT_EQ(2, n); +} +``` + +**Some tips:** + +* If you want, you can still override the default action by providing your own + `ON_CALL()` or using `.WillOnce()` / `.WillRepeatedly()` in `EXPECT_CALL()`. +* In `DelegateToFake()`, you only need to delegate the methods whose fake + implementation you intend to use. + +* The general technique discussed here works for overloaded methods, but + you'll need to tell the compiler which version you mean. To disambiguate a + mock function (the one you specify inside the parentheses of `ON_CALL()`), + use [this technique](#SelectOverload); to disambiguate a fake function (the + one you place inside `Invoke()`), use a `static_cast` to specify the + function's type. For instance, if class `Foo` has methods `char DoThis(int + n)` and `bool DoThis(double x) const`, and you want to invoke the latter, + you need to write `Invoke(&fake_, static_cast(&FakeFoo::DoThis))` instead of `Invoke(&fake_, &FakeFoo::DoThis)` + (The strange-looking thing inside the angled brackets of `static_cast` is + the type of a function pointer to the second `DoThis()` method.). + +* Having to mix a mock and a fake is often a sign of something gone wrong. + Perhaps you haven't got used to the interaction-based way of testing yet. Or + perhaps your interface is taking on too many roles and should be split up. + Therefore, **don't abuse this**. We would only recommend to do it as an + intermediate step when you are refactoring your code. + +Regarding the tip on mixing a mock and a fake, here's an example on why it may +be a bad sign: Suppose you have a class `System` for low-level system +operations. In particular, it does file and I/O operations. And suppose you want +to test how your code uses `System` to do I/O, and you just want the file +operations to work normally. If you mock out the entire `System` class, you'll +have to provide a fake implementation for the file operation part, which +suggests that `System` is taking on too many roles. 
+ +Instead, you can define a `FileOps` interface and an `IOOps` interface and split +`System`'s functionalities into the two. Then you can mock `IOOps` without +mocking `FileOps`. + +### Delegating Calls to a Real Object + +When using testing doubles (mocks, fakes, stubs, and etc), sometimes their +behaviors will differ from those of the real objects. This difference could be +either intentional (as in simulating an error such that you can test the error +handling code) or unintentional. If your mocks have different behaviors than the +real objects by mistake, you could end up with code that passes the tests but +fails in production. + +You can use the *delegating-to-real* technique to ensure that your mock has the +same behavior as the real object while retaining the ability to validate calls. +This technique is very similar to the [delegating-to-fake](#DelegatingToFake) +technique, the difference being that we use a real object instead of a fake. +Here's an example: + +```cpp +using ::testing::AtLeast; + +class MockFoo : public Foo { + public: + MockFoo() { + // By default, all calls are delegated to the real object. + ON_CALL(*this, DoThis).WillByDefault([this](int n) { + return real_.DoThis(n); + }); + ON_CALL(*this, DoThat).WillByDefault([this](const char* s, int* p) { + real_.DoThat(s, p); + }); + ... + } + MOCK_METHOD(char, DoThis, ...); + MOCK_METHOD(void, DoThat, ...); + ... + private: + Foo real_; +}; + +... + MockFoo mock; + EXPECT_CALL(mock, DoThis()) + .Times(3); + EXPECT_CALL(mock, DoThat("Hi")) + .Times(AtLeast(1)); + ... use mock in test ... +``` + +With this, gMock will verify that your code made the right calls (with the right +arguments, in the right order, called the right number of times, etc), and a +real object will answer the calls (so the behavior will be the same as in +production). This gives you the best of both worlds. + +### Delegating Calls to a Parent Class + +Ideally, you should code to interfaces, whose methods are all pure virtual. In +reality, sometimes you do need to mock a virtual method that is not pure (i.e, +it already has an implementation). For example: + +```cpp +class Foo { + public: + virtual ~Foo(); + + virtual void Pure(int n) = 0; + virtual int Concrete(const char* str) { ... } +}; + +class MockFoo : public Foo { + public: + // Mocking a pure method. + MOCK_METHOD(void, Pure, (int n), (override)); + // Mocking a concrete method. Foo::Concrete() is shadowed. + MOCK_METHOD(int, Concrete, (const char* str), (override)); +}; +``` + +Sometimes you may want to call `Foo::Concrete()` instead of +`MockFoo::Concrete()`. Perhaps you want to do it as part of a stub action, or +perhaps your test doesn't need to mock `Concrete()` at all (but it would be +oh-so painful to have to define a new mock class whenever you don't need to mock +one of its methods). + +The trick is to leave a back door in your mock class for accessing the real +methods in the base class: + +```cpp +class MockFoo : public Foo { + public: + // Mocking a pure method. + MOCK_METHOD(void, Pure, (int n), (override)); + // Mocking a concrete method. Foo::Concrete() is shadowed. + MOCK_METHOD(int, Concrete, (const char* str), (override)); + + // Use this to call Concrete() defined in Foo. + int FooConcrete(const char* str) { return Foo::Concrete(str); } +}; +``` + +Now, you can call `Foo::Concrete()` inside an action by: + +```cpp +... 
+ EXPECT_CALL(foo, Concrete).WillOnce([&foo](const char* str) { + return foo.FooConcrete(str); + }); +``` + +or tell the mock object that you don't want to mock `Concrete()`: + +```cpp +... + ON_CALL(foo, Concrete).WillByDefault([&foo](const char* str) { + return foo.FooConcrete(str); + }); +``` + +(Why don't we just write `{ return foo.Concrete(str); }`? If you do that, +`MockFoo::Concrete()` will be called (and cause an infinite recursion) since +`Foo::Concrete()` is virtual. That's just how C++ works.) + +## Using Matchers + +### Matching Argument Values Exactly + +You can specify exactly which arguments a mock method is expecting: + +```cpp +using ::testing::Return; +... + EXPECT_CALL(foo, DoThis(5)) + .WillOnce(Return('a')); + EXPECT_CALL(foo, DoThat("Hello", bar)); +``` + +### Using Simple Matchers + +You can use matchers to match arguments that have a certain property: + +```cpp +using ::testing::NotNull; +using ::testing::Return; +... + EXPECT_CALL(foo, DoThis(Ge(5))) // The argument must be >= 5. + .WillOnce(Return('a')); + EXPECT_CALL(foo, DoThat("Hello", NotNull())); + // The second argument must not be NULL. +``` + +A frequently used matcher is `_`, which matches anything: + +```cpp + EXPECT_CALL(foo, DoThat(_, NotNull())); +``` + + +### Combining Matchers {#CombiningMatchers} + +You can build complex matchers from existing ones using `AllOf()`, +`AllOfArray()`, `AnyOf()`, `AnyOfArray()` and `Not()`: + +```cpp +using ::testing::AllOf; +using ::testing::Gt; +using ::testing::HasSubstr; +using ::testing::Ne; +using ::testing::Not; +... + // The argument must be > 5 and != 10. + EXPECT_CALL(foo, DoThis(AllOf(Gt(5), + Ne(10)))); + + // The first argument must not contain sub-string "blah". + EXPECT_CALL(foo, DoThat(Not(HasSubstr("blah")), + NULL)); +``` + +### Casting Matchers {#SafeMatcherCast} + +gMock matchers are statically typed, meaning that the compiler can catch your +mistake if you use a matcher of the wrong type (for example, if you use `Eq(5)` +to match a `string` argument). Good for you! + +Sometimes, however, you know what you're doing and want the compiler to give you +some slack. One example is that you have a matcher for `long` and the argument +you want to match is `int`. While the two types aren't exactly the same, there +is nothing really wrong with using a `Matcher` to match an `int` - after +all, we can first convert the `int` argument to a `long` losslessly before +giving it to the matcher. + +To support this need, gMock gives you the `SafeMatcherCast(m)` function. It +casts a matcher `m` to type `Matcher`. To ensure safety, gMock checks that +(let `U` be the type `m` accepts : + +1. Type `T` can be *implicitly* cast to type `U`; +2. When both `T` and `U` are built-in arithmetic types (`bool`, integers, and + floating-point numbers), the conversion from `T` to `U` is not lossy (in + other words, any value representable by `T` can also be represented by `U`); + and +3. When `U` is a reference, `T` must also be a reference (as the underlying + matcher may be interested in the address of the `U` value). + +The code won't compile if any of these conditions isn't met. + +Here's one example: + +```cpp +using ::testing::SafeMatcherCast; + +// A base class and a child class. +class Base { ... }; +class Derived : public Base { ... }; + +class MockFoo : public Foo { + public: + MOCK_METHOD(void, DoThis, (Derived* derived), (override)); +}; + +... + MockFoo foo; + // m is a Matcher we got from somewhere. 
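  // m is assumed to be a Matcher<Base*>; SafeMatcherCast<Derived*>(m) turns it
  // into the Matcher<Derived*> that DoThis() expects.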
+ EXPECT_CALL(foo, DoThis(SafeMatcherCast(m))); +``` + +If you find `SafeMatcherCast(m)` too limiting, you can use a similar function +`MatcherCast(m)`. The difference is that `MatcherCast` works as long as you +can `static_cast` type `T` to type `U`. + +`MatcherCast` essentially lets you bypass C++'s type system (`static_cast` isn't +always safe as it could throw away information, for example), so be careful not +to misuse/abuse it. + +### Selecting Between Overloaded Functions {#SelectOverload} + +If you expect an overloaded function to be called, the compiler may need some +help on which overloaded version it is. + +To disambiguate functions overloaded on the const-ness of this object, use the +`Const()` argument wrapper. + +```cpp +using ::testing::ReturnRef; + +class MockFoo : public Foo { + ... + MOCK_METHOD(Bar&, GetBar, (), (override)); + MOCK_METHOD(const Bar&, GetBar, (), (const, override)); +}; + +... + MockFoo foo; + Bar bar1, bar2; + EXPECT_CALL(foo, GetBar()) // The non-const GetBar(). + .WillOnce(ReturnRef(bar1)); + EXPECT_CALL(Const(foo), GetBar()) // The const GetBar(). + .WillOnce(ReturnRef(bar2)); +``` + +(`Const()` is defined by gMock and returns a `const` reference to its argument.) + +To disambiguate overloaded functions with the same number of arguments but +different argument types, you may need to specify the exact type of a matcher, +either by wrapping your matcher in `Matcher()`, or using a matcher whose +type is fixed (`TypedEq`, `An()`, etc): + +```cpp +using ::testing::An; +using ::testing::Matcher; +using ::testing::TypedEq; + +class MockPrinter : public Printer { + public: + MOCK_METHOD(void, Print, (int n), (override)); + MOCK_METHOD(void, Print, (char c), (override)); +}; + +TEST(PrinterTest, Print) { + MockPrinter printer; + + EXPECT_CALL(printer, Print(An())); // void Print(int); + EXPECT_CALL(printer, Print(Matcher(Lt(5)))); // void Print(int); + EXPECT_CALL(printer, Print(TypedEq('a'))); // void Print(char); + + printer.Print(3); + printer.Print(6); + printer.Print('a'); +} +``` + +### Performing Different Actions Based on the Arguments + +When a mock method is called, the *last* matching expectation that's still +active will be selected (think "newer overrides older"). So, you can make a +method do different things depending on its argument values like this: + +```cpp +using ::testing::_; +using ::testing::Lt; +using ::testing::Return; +... + // The default case. + EXPECT_CALL(foo, DoThis(_)) + .WillRepeatedly(Return('b')); + // The more specific case. + EXPECT_CALL(foo, DoThis(Lt(5))) + .WillRepeatedly(Return('a')); +``` + +Now, if `foo.DoThis()` is called with a value less than 5, `'a'` will be +returned; otherwise `'b'` will be returned. + +### Matching Multiple Arguments as a Whole + +Sometimes it's not enough to match the arguments individually. For example, we +may want to say that the first argument must be less than the second argument. +The `With()` clause allows us to match all arguments of a mock function as a +whole. For example, + +```cpp +using ::testing::_; +using ::testing::Ne; +using ::testing::Lt; +... + EXPECT_CALL(foo, InRange(Ne(0), _)) + .With(Lt()); +``` + +says that the first argument of `InRange()` must not be 0, and must be less than +the second argument. + +The expression inside `With()` must be a matcher of type `Matcher>`, where `A1`, ..., `An` are the types of the function arguments. + +You can also write `AllArgs(m)` instead of `m` inside `.With()`. 
The two forms +are equivalent, but `.With(AllArgs(Lt()))` is more readable than `.With(Lt())`. + +You can use `Args(m)` to match the `n` selected arguments (as a +tuple) against `m`. For example, + +```cpp +using ::testing::_; +using ::testing::AllOf; +using ::testing::Args; +using ::testing::Lt; +... + EXPECT_CALL(foo, Blah) + .With(AllOf(Args<0, 1>(Lt()), Args<1, 2>(Lt()))); +``` + +says that `Blah` will be called with arguments `x`, `y`, and `z` where `x < y < +z`. Note that in this example, it wasn't necessary specify the positional +matchers. + +As a convenience and example, gMock provides some matchers for 2-tuples, +including the `Lt()` matcher above. See [here](#MultiArgMatchers) for the +complete list. + +Note that if you want to pass the arguments to a predicate of your own (e.g. +`.With(Args<0, 1>(Truly(&MyPredicate)))`), that predicate MUST be written to +take a `std::tuple` as its argument; gMock will pass the `n` selected arguments +as *one* single tuple to the predicate. + +### Using Matchers as Predicates + +Have you noticed that a matcher is just a fancy predicate that also knows how to +describe itself? Many existing algorithms take predicates as arguments (e.g. +those defined in STL's `` header), and it would be a shame if gMock +matchers were not allowed to participate. + +Luckily, you can use a matcher where a unary predicate functor is expected by +wrapping it inside the `Matches()` function. For example, + +```cpp +#include +#include + +using ::testing::Matches; +using ::testing::Ge; + +vector v; +... +// How many elements in v are >= 10? +const int count = count_if(v.begin(), v.end(), Matches(Ge(10))); +``` + +Since you can build complex matchers from simpler ones easily using gMock, this +gives you a way to conveniently construct composite predicates (doing the same +using STL's `` header is just painful). For example, here's a +predicate that's satisfied by any number that is >= 0, <= 100, and != 50: + +```cpp +using testing::AllOf; +using testing::Ge; +using testing::Le; +using testing::Matches; +using testing::Ne; +... +Matches(AllOf(Ge(0), Le(100), Ne(50))) +``` + +### Using Matchers in googletest Assertions + +Since matchers are basically predicates that also know how to describe +themselves, there is a way to take advantage of them in googletest assertions. +It's called `ASSERT_THAT` and `EXPECT_THAT`: + +```cpp + ASSERT_THAT(value, matcher); // Asserts that value matches matcher. + EXPECT_THAT(value, matcher); // The non-fatal version. +``` + +For example, in a googletest test you can write: + +```cpp +#include "gmock/gmock.h" + +using ::testing::AllOf; +using ::testing::Ge; +using ::testing::Le; +using ::testing::MatchesRegex; +using ::testing::StartsWith; + +... + EXPECT_THAT(Foo(), StartsWith("Hello")); + EXPECT_THAT(Bar(), MatchesRegex("Line \\d+")); + ASSERT_THAT(Baz(), AllOf(Ge(5), Le(10))); +``` + +which (as you can probably guess) executes `Foo()`, `Bar()`, and `Baz()`, and +verifies that: + +* `Foo()` returns a string that starts with `"Hello"`. +* `Bar()` returns a string that matches regular expression `"Line \\d+"`. +* `Baz()` returns a number in the range [5, 10]. + +The nice thing about these macros is that *they read like English*. They +generate informative messages too. For example, if the first `EXPECT_THAT()` +above fails, the message will be something like: + +```cpp +Value of: Foo() + Actual: "Hi, world!" 
+Expected: starts with "Hello" +``` + +**Credit:** The idea of `(ASSERT|EXPECT)_THAT` was borrowed from Joe Walnes' +Hamcrest project, which adds `assertThat()` to JUnit. + +### Using Predicates as Matchers + +gMock provides a [built-in set](#MatcherList) of matchers. In case you find them +lacking, you can use an arbitrary unary predicate function or functor as a +matcher - as long as the predicate accepts a value of the type you want. You do +this by wrapping the predicate inside the `Truly()` function, for example: + +```cpp +using ::testing::Truly; + +int IsEven(int n) { return (n % 2) == 0 ? 1 : 0; } +... + // Bar() must be called with an even number. + EXPECT_CALL(foo, Bar(Truly(IsEven))); +``` + +Note that the predicate function / functor doesn't have to return `bool`. It +works as long as the return value can be used as the condition in in statement +`if (condition) ...`. + + + +### Matching Arguments that Are Not Copyable + +When you do an `EXPECT_CALL(mock_obj, Foo(bar))`, gMock saves away a copy of +`bar`. When `Foo()` is called later, gMock compares the argument to `Foo()` with +the saved copy of `bar`. This way, you don't need to worry about `bar` being +modified or destroyed after the `EXPECT_CALL()` is executed. The same is true +when you use matchers like `Eq(bar)`, `Le(bar)`, and so on. + +But what if `bar` cannot be copied (i.e. has no copy constructor)? You could +define your own matcher function or callback and use it with `Truly()`, as the +previous couple of recipes have shown. Or, you may be able to get away from it +if you can guarantee that `bar` won't be changed after the `EXPECT_CALL()` is +executed. Just tell gMock that it should save a reference to `bar`, instead of a +copy of it. Here's how: + +```cpp +using ::testing::ByRef; +using ::testing::Eq; +using ::testing::Lt; +... + // Expects that Foo()'s argument == bar. + EXPECT_CALL(mock_obj, Foo(Eq(ByRef(bar)))); + + // Expects that Foo()'s argument < bar. + EXPECT_CALL(mock_obj, Foo(Lt(ByRef(bar)))); +``` + +Remember: if you do this, don't change `bar` after the `EXPECT_CALL()`, or the +result is undefined. + +### Validating a Member of an Object + +Often a mock function takes a reference to object as an argument. When matching +the argument, you may not want to compare the entire object against a fixed +object, as that may be over-specification. Instead, you may need to validate a +certain member variable or the result of a certain getter method of the object. +You can do this with `Field()` and `Property()`. More specifically, + +```cpp +Field(&Foo::bar, m) +``` + +is a matcher that matches a `Foo` object whose `bar` member variable satisfies +matcher `m`. + +```cpp +Property(&Foo::baz, m) +``` + +is a matcher that matches a `Foo` object whose `baz()` method returns a value +that satisfies matcher `m`. + +For example: + + +| Expression | Description | +| :--------------------------- | :--------------------------------------- | +| `Field(&Foo::number, Ge(3))` | Matches `x` where `x.number >= 3`. | +| `Property(&Foo::name, StartsWith("John "))` | Matches `x` where `x.name()` starts with `"John "`. | + + +Note that in `Property(&Foo::baz, ...)`, method `baz()` must take no argument +and be declared as `const`. + +BTW, `Field()` and `Property()` can also match plain pointers to objects. For +instance, + +```cpp +using ::testing::Field; +using ::testing::Ge; +... +Field(&Foo::number, Ge(3)) +``` + +matches a plain pointer `p` where `p->number >= 3`. 
If `p` is `NULL`, the match +will always fail regardless of the inner matcher. + +What if you want to validate more than one members at the same time? Remember +that there are [`AllOf()` and `AllOfArray()`](#CombiningMatchers). + +Finally `Field()` and `Property()` provide overloads that take the field or +property names as the first argument to include it in the error message. This +can be useful when creating combined matchers. + +```cpp +using ::testing::AllOf; +using ::testing::Field; +using ::testing::Matcher; +using ::testing::SafeMatcherCast; + +Matcher IsFoo(const Foo& foo) { + return AllOf(Field("some_field", &Foo::some_field, foo.some_field), + Field("other_field", &Foo::other_field, foo.other_field), + Field("last_field", &Foo::last_field, foo.last_field)); +} +``` + +### Validating the Value Pointed to by a Pointer Argument + +C++ functions often take pointers as arguments. You can use matchers like +`IsNull()`, `NotNull()`, and other comparison matchers to match a pointer, but +what if you want to make sure the value *pointed to* by the pointer, instead of +the pointer itself, has a certain property? Well, you can use the `Pointee(m)` +matcher. + +`Pointee(m)` matches a pointer if and only if `m` matches the value the pointer +points to. For example: + +```cpp +using ::testing::Ge; +using ::testing::Pointee; +... + EXPECT_CALL(foo, Bar(Pointee(Ge(3)))); +``` + +expects `foo.Bar()` to be called with a pointer that points to a value greater +than or equal to 3. + +One nice thing about `Pointee()` is that it treats a `NULL` pointer as a match +failure, so you can write `Pointee(m)` instead of + +```cpp +using ::testing::AllOf; +using ::testing::NotNull; +using ::testing::Pointee; +... + AllOf(NotNull(), Pointee(m)) +``` + +without worrying that a `NULL` pointer will crash your test. + +Also, did we tell you that `Pointee()` works with both raw pointers **and** +smart pointers (`std::unique_ptr`, `std::shared_ptr`, etc)? + +What if you have a pointer to pointer? You guessed it - you can use nested +`Pointee()` to probe deeper inside the value. For example, +`Pointee(Pointee(Lt(3)))` matches a pointer that points to a pointer that points +to a number less than 3 (what a mouthful...). + +### Testing a Certain Property of an Object + +Sometimes you want to specify that an object argument has a certain property, +but there is no existing matcher that does this. If you want good error +messages, you should [define a matcher](#NewMatchers). If you want to do it +quick and dirty, you could get away with writing an ordinary function. + +Let's say you have a mock function that takes an object of type `Foo`, which has +an `int bar()` method and an `int baz()` method, and you want to constrain that +the argument's `bar()` value plus its `baz()` value is a given number. 
Here's +how you can define a matcher to do it: + +```cpp +using ::testing::Matcher; +using ::testing::MatcherInterface; +using ::testing::MatchResultListener; + +class BarPlusBazEqMatcher : public MatcherInterface { + public: + explicit BarPlusBazEqMatcher(int expected_sum) + : expected_sum_(expected_sum) {} + + bool MatchAndExplain(const Foo& foo, + MatchResultListener* /* listener */) const override { + return (foo.bar() + foo.baz()) == expected_sum_; + } + + void DescribeTo(std::ostream* os) const override { + *os << "bar() + baz() equals " << expected_sum_; + } + + void DescribeNegationTo(std::ostream* os) const override { + *os << "bar() + baz() does not equal " << expected_sum_; + } + private: + const int expected_sum_; +}; + +Matcher BarPlusBazEq(int expected_sum) { + return MakeMatcher(new BarPlusBazEqMatcher(expected_sum)); +} + +... + EXPECT_CALL(..., DoThis(BarPlusBazEq(5)))...; +``` + +### Matching Containers + +Sometimes an STL container (e.g. list, vector, map, ...) is passed to a mock +function and you may want to validate it. Since most STL containers support the +`==` operator, you can write `Eq(expected_container)` or simply +`expected_container` to match a container exactly. + +Sometimes, though, you may want to be more flexible (for example, the first +element must be an exact match, but the second element can be any positive +number, and so on). Also, containers used in tests often have a small number of +elements, and having to define the expected container out-of-line is a bit of a +hassle. + +You can use the `ElementsAre()` or `UnorderedElementsAre()` matcher in such +cases: + +```cpp +using ::testing::_; +using ::testing::ElementsAre; +using ::testing::Gt; +... + MOCK_METHOD(void, Foo, (const vector& numbers), (override)); +... + EXPECT_CALL(mock, Foo(ElementsAre(1, Gt(0), _, 5))); +``` + +The above matcher says that the container must have 4 elements, which must be 1, +greater than 0, anything, and 5 respectively. + +If you instead write: + +```cpp +using ::testing::_; +using ::testing::Gt; +using ::testing::UnorderedElementsAre; +... + MOCK_METHOD(void, Foo, (const vector& numbers), (override)); +... + EXPECT_CALL(mock, Foo(UnorderedElementsAre(1, Gt(0), _, 5))); +``` + +It means that the container must have 4 elements, which (under some permutation) +must be 1, greater than 0, anything, and 5 respectively. + +As an alternative you can place the arguments in a C-style array and use +`ElementsAreArray()` or `UnorderedElementsAreArray()` instead: + +```cpp +using ::testing::ElementsAreArray; +... + // ElementsAreArray accepts an array of element values. + const int expected_vector1[] = {1, 5, 2, 4, ...}; + EXPECT_CALL(mock, Foo(ElementsAreArray(expected_vector1))); + + // Or, an array of element matchers. + Matcher expected_vector2[] = {1, Gt(2), _, 3, ...}; + EXPECT_CALL(mock, Foo(ElementsAreArray(expected_vector2))); +``` + +In case the array needs to be dynamically created (and therefore the array size +cannot be inferred by the compiler), you can give `ElementsAreArray()` an +additional argument to specify the array size: + +```cpp +using ::testing::ElementsAreArray; +... + int* const expected_vector3 = new int[count]; + ... fill expected_vector3 with values ... + EXPECT_CALL(mock, Foo(ElementsAreArray(expected_vector3, count))); +``` + +Use `Pair` when comparing maps or other associative containers. + +```cpp +using testing::ElementsAre; +using testing::Pair; +... 
+ std::map m = {{"a", 1}, {"b", 2}, {"c", 3}}; + EXPECT_THAT(m, ElementsAre(Pair("a", 1), Pair("b", 2), Pair("c", 3))); +``` + +**Tips:** + +* `ElementsAre*()` can be used to match *any* container that implements the + STL iterator pattern (i.e. it has a `const_iterator` type and supports + `begin()/end()`), not just the ones defined in STL. It will even work with + container types yet to be written - as long as they follows the above + pattern. +* You can use nested `ElementsAre*()` to match nested (multi-dimensional) + containers. +* If the container is passed by pointer instead of by reference, just write + `Pointee(ElementsAre*(...))`. +* The order of elements *matters* for `ElementsAre*()`. If you are using it + with containers whose element order are undefined (e.g. `hash_map`) you + should use `WhenSorted` around `ElementsAre`. + +### Sharing Matchers + +Under the hood, a gMock matcher object consists of a pointer to a ref-counted +implementation object. Copying matchers is allowed and very efficient, as only +the pointer is copied. When the last matcher that references the implementation +object dies, the implementation object will be deleted. + +Therefore, if you have some complex matcher that you want to use again and +again, there is no need to build it everytime. Just assign it to a matcher +variable and use that variable repeatedly! For example, + +```cpp +using ::testing::AllOf; +using ::testing::Gt; +using ::testing::Le; +using ::testing::Matcher; +... + Matcher in_range = AllOf(Gt(5), Le(10)); + ... use in_range as a matcher in multiple EXPECT_CALLs ... +``` + +### Matchers must have no side-effects {#PureMatchers} + +WARNING: gMock does not guarantee when or how many times a matcher will be +invoked. Therefore, all matchers must be *purely functional*: they cannot have +any side effects, and the match result must not depend on anything other than +the matcher's parameters and the value being matched. + +This requirement must be satisfied no matter how a matcher is defined (e.g., if +it is one of the standard matchers, or a custom matcher). In particular, a +matcher can never call a mock function, as that will affect the state of the +mock object and gMock. + +## Setting Expectations + +### Knowing When to Expect {#UseOnCall} + + + +**`ON_CALL`** is likely the *single most under-utilized construct* in gMock. + +There are basically two constructs for defining the behavior of a mock object: +`ON_CALL` and `EXPECT_CALL`. The difference? `ON_CALL` defines what happens when +a mock method is called, but doesn't imply any expectation on the method +being called. `EXPECT_CALL` not only defines the behavior, but also sets an +expectation that the method will be called with the given arguments, for the +given number of times (and *in the given order* when you specify the order +too). + +Since `EXPECT_CALL` does more, isn't it better than `ON_CALL`? Not really. Every +`EXPECT_CALL` adds a constraint on the behavior of the code under test. Having +more constraints than necessary is *baaad* - even worse than not having enough +constraints. + +This may be counter-intuitive. How could tests that verify more be worse than +tests that verify less? Isn't verification the whole point of tests? + +The answer lies in *what* a test should verify. **A good test verifies the +contract of the code.** If a test over-specifies, it doesn't leave enough +freedom to the implementation. As a result, changing the implementation without +breaking the contract (e.g. 
refactoring and optimization), which should be +perfectly fine to do, can break such tests. Then you have to spend time fixing +them, only to see them broken again the next time the implementation is changed. + +Keep in mind that one doesn't have to verify more than one property in one test. +In fact, **it's a good style to verify only one thing in one test.** If you do +that, a bug will likely break only one or two tests instead of dozens (which +case would you rather debug?). If you are also in the habit of giving tests +descriptive names that tell what they verify, you can often easily guess what's +wrong just from the test log itself. + +So use `ON_CALL` by default, and only use `EXPECT_CALL` when you actually intend +to verify that the call is made. For example, you may have a bunch of `ON_CALL`s +in your test fixture to set the common mock behavior shared by all tests in the +same group, and write (scarcely) different `EXPECT_CALL`s in different `TEST_F`s +to verify different aspects of the code's behavior. Compared with the style +where each `TEST` has many `EXPECT_CALL`s, this leads to tests that are more +resilient to implementational changes (and thus less likely to require +maintenance) and makes the intent of the tests more obvious (so they are easier +to maintain when you do need to maintain them). + +If you are bothered by the "Uninteresting mock function call" message printed +when a mock method without an `EXPECT_CALL` is called, you may use a `NiceMock` +instead to suppress all such messages for the mock object, or suppress the +message for specific methods by adding `EXPECT_CALL(...).Times(AnyNumber())`. DO +NOT suppress it by blindly adding an `EXPECT_CALL(...)`, or you'll have a test +that's a pain to maintain. + +### Ignoring Uninteresting Calls + +If you are not interested in how a mock method is called, just don't say +anything about it. In this case, if the method is ever called, gMock will +perform its default action to allow the test program to continue. If you are not +happy with the default action taken by gMock, you can override it using +`DefaultValue::Set()` (described [here](#DefaultValue)) or `ON_CALL()`. + +Please note that once you expressed interest in a particular mock method (via +`EXPECT_CALL()`), all invocations to it must match some expectation. If this +function is called but the arguments don't match any `EXPECT_CALL()` statement, +it will be an error. + +### Disallowing Unexpected Calls + +If a mock method shouldn't be called at all, explicitly say so: + +```cpp +using ::testing::_; +... + EXPECT_CALL(foo, Bar(_)) + .Times(0); +``` + +If some calls to the method are allowed, but the rest are not, just list all the +expected calls: + +```cpp +using ::testing::AnyNumber; +using ::testing::Gt; +... + EXPECT_CALL(foo, Bar(5)); + EXPECT_CALL(foo, Bar(Gt(10))) + .Times(AnyNumber()); +``` + +A call to `foo.Bar()` that doesn't match any of the `EXPECT_CALL()` statements +will be an error. + +### Understanding Uninteresting vs Unexpected Calls {#uninteresting-vs-unexpected} + +*Uninteresting* calls and *unexpected* calls are different concepts in gMock. +*Very* different. + +A call `x.Y(...)` is **uninteresting** if there's *not even a single* +`EXPECT_CALL(x, Y(...))` set. In other words, the test isn't interested in the +`x.Y()` method at all, as evident in that the test doesn't care to say anything +about it. + +A call `x.Y(...)` is **unexpected** if there are *some* `EXPECT_CALL(x, +Y(...))`s set, but none of them matches the call. 
Put another way, the test is +interested in the `x.Y()` method (therefore it explicitly sets some +`EXPECT_CALL` to verify how it's called); however, the verification fails as the +test doesn't expect this particular call to happen. + +**An unexpected call is always an error,** as the code under test doesn't behave +the way the test expects it to behave. + +**By default, an uninteresting call is not an error,** as it violates no +constraint specified by the test. (gMock's philosophy is that saying nothing +means there is no constraint.) However, it leads to a warning, as it *might* +indicate a problem (e.g. the test author might have forgotten to specify a +constraint). + +In gMock, `NiceMock` and `StrictMock` can be used to make a mock class "nice" or +"strict". How does this affect uninteresting calls and unexpected calls? + +A **nice mock** suppresses uninteresting call *warnings*. It is less chatty than +the default mock, but otherwise is the same. If a test fails with a default +mock, it will also fail using a nice mock instead. And vice versa. Don't expect +making a mock nice to change the test's result. + +A **strict mock** turns uninteresting call warnings into errors. So making a +mock strict may change the test's result. + +Let's look at an example: + +```cpp +TEST(...) { + NiceMock mock_registry; + EXPECT_CALL(mock_registry, GetDomainOwner("google.com")) + .WillRepeatedly(Return("Larry Page")); + + // Use mock_registry in code under test. + ... &mock_registry ... +} +``` + +The sole `EXPECT_CALL` here says that all calls to `GetDomainOwner()` must have +`"google.com"` as the argument. If `GetDomainOwner("yahoo.com")` is called, it +will be an unexpected call, and thus an error. *Having a nice mock doesn't +change the severity of an unexpected call.* + +So how do we tell gMock that `GetDomainOwner()` can be called with some other +arguments as well? The standard technique is to add a "catch all" `EXPECT_CALL`: + +```cpp + EXPECT_CALL(mock_registry, GetDomainOwner(_)) + .Times(AnyNumber()); // catches all other calls to this method. + EXPECT_CALL(mock_registry, GetDomainOwner("google.com")) + .WillRepeatedly(Return("Larry Page")); +``` + +Remember that `_` is the wildcard matcher that matches anything. With this, if +`GetDomainOwner("google.com")` is called, it will do what the second +`EXPECT_CALL` says; if it is called with a different argument, it will do what +the first `EXPECT_CALL` says. + +Note that the order of the two `EXPECT_CALL`s is important, as a newer +`EXPECT_CALL` takes precedence over an older one. + +For more on uninteresting calls, nice mocks, and strict mocks, read +["The Nice, the Strict, and the Naggy"](#NiceStrictNaggy). + +### Ignoring Uninteresting Arguments {#ParameterlessExpectations} + +If your test doesn't care about the parameters (it only cares about the number +or order of calls), you can often simply omit the parameter list: + +```cpp + // Expect foo.Bar( ... ) twice with any arguments. + EXPECT_CALL(foo, Bar).Times(2); + + // Delegate to the given method whenever the factory is invoked. + ON_CALL(foo_factory, MakeFoo) + .WillByDefault(&BuildFooForTest); +``` + +This functionality is only available when a method is not overloaded; to prevent +unexpected behavior it is a compilation error to try to set an expectation on a +method where the specific overload is ambiguous. You can work around this by +supplying a [simpler mock interface](#SimplerInterfaces) than the mocked class +provides. 
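For instance, the [simpler interface](#SimplerInterfaces) trick can be as small as a single redispatch method. A minimal sketch, assuming a hypothetical `Backend` interface whose `Send()` is overloaded:

```cpp
class MockBackend : public Backend {
 public:
  // Both overloads forward to one non-overloaded mock method, so
  // EXPECT_CALL(mock, DoSend) is unambiguous.
  void Send(const std::string& payload) override { DoSend(); }
  void Send(const std::string& payload, int priority) override { DoSend(); }

  MOCK_METHOD(void, DoSend, ());
};

...
  MockBackend backend;
  // Cares only about how many times Send() is called, in either form.
  EXPECT_CALL(backend, DoSend).Times(2);
```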
+ +This pattern is also useful when the arguments are interesting, but match logic +is substantially complex. You can leave the argument list unspecified and use +SaveArg actions to [save the values for later verification](#SaveArgVerify). If +you do that, you can easily differentiate calling the method the wrong number of +times from calling it with the wrong arguments. + +### Expecting Ordered Calls {#OrderedCalls} + +Although an `EXPECT_CALL()` statement defined earlier takes precedence when +gMock tries to match a function call with an expectation, by default calls don't +have to happen in the order `EXPECT_CALL()` statements are written. For example, +if the arguments match the matchers in the third `EXPECT_CALL()`, but not those +in the first two, then the third expectation will be used. + +If you would rather have all calls occur in the order of the expectations, put +the `EXPECT_CALL()` statements in a block where you define a variable of type +`InSequence`: + +```cpp +using ::testing::_; +using ::testing::InSequence; + + { + InSequence s; + + EXPECT_CALL(foo, DoThis(5)); + EXPECT_CALL(bar, DoThat(_)) + .Times(2); + EXPECT_CALL(foo, DoThis(6)); + } +``` + +In this example, we expect a call to `foo.DoThis(5)`, followed by two calls to +`bar.DoThat()` where the argument can be anything, which are in turn followed by +a call to `foo.DoThis(6)`. If a call occurred out-of-order, gMock will report an +error. + +### Expecting Partially Ordered Calls {#PartialOrder} + +Sometimes requiring everything to occur in a predetermined order can lead to +brittle tests. For example, we may care about `A` occurring before both `B` and +`C`, but aren't interested in the relative order of `B` and `C`. In this case, +the test should reflect our real intent, instead of being overly constraining. + +gMock allows you to impose an arbitrary DAG (directed acyclic graph) on the +calls. One way to express the DAG is to use the [After](#AfterClause) clause of +`EXPECT_CALL`. + +Another way is via the `InSequence()` clause (not the same as the `InSequence` +class), which we borrowed from jMock 2. It's less flexible than `After()`, but +more convenient when you have long chains of sequential calls, as it doesn't +require you to come up with different names for the expectations in the chains. +Here's how it works: + +If we view `EXPECT_CALL()` statements as nodes in a graph, and add an edge from +node A to node B wherever A must occur before B, we can get a DAG. We use the +term "sequence" to mean a directed path in this DAG. Now, if we decompose the +DAG into sequences, we just need to know which sequences each `EXPECT_CALL()` +belongs to in order to be able to reconstruct the original DAG. + +So, to specify the partial order on the expectations we need to do two things: +first to define some `Sequence` objects, and then for each `EXPECT_CALL()` say +which `Sequence` objects it is part of. + +Expectations in the same sequence must occur in the order they are written. For +example, + +```cpp +using ::testing::Sequence; +... + Sequence s1, s2; + + EXPECT_CALL(foo, A()) + .InSequence(s1, s2); + EXPECT_CALL(bar, B()) + .InSequence(s1); + EXPECT_CALL(bar, C()) + .InSequence(s2); + EXPECT_CALL(foo, D()) + .InSequence(s2); +``` + +specifies the following DAG (where `s1` is `A -> B`, and `s2` is `A -> C -> D`): + +```text + +---> B + | + A ---| + | + +---> C ---> D +``` + +This means that A must occur before B and C, and C must occur before D. There's +no restriction about the order other than these. 
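For comparison, the same DAG can be written with the `After()` clause instead of `Sequence` objects. A rough sketch, reusing the `foo` and `bar` mocks from the example above:

```cpp
using ::testing::Expectation;
...
  Expectation a = EXPECT_CALL(foo, A());
  Expectation c = EXPECT_CALL(bar, C()).After(a);  // A must occur before C.
  EXPECT_CALL(bar, B()).After(a);                  // A must occur before B.
  EXPECT_CALL(foo, D()).After(c);                  // C must occur before D.
```

With `After()`, every expectation that something must follow needs a name, which is why the `Sequence`-based form is usually more convenient for long chains of calls.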
+ +### Controlling When an Expectation Retires + +When a mock method is called, gMock only considers expectations that are still +active. An expectation is active when created, and becomes inactive (aka +*retires*) when a call that has to occur later has occurred. For example, in + +```cpp +using ::testing::_; +using ::testing::Sequence; +... + Sequence s1, s2; + + EXPECT_CALL(log, Log(WARNING, _, "File too large.")) // #1 + .Times(AnyNumber()) + .InSequence(s1, s2); + EXPECT_CALL(log, Log(WARNING, _, "Data set is empty.")) // #2 + .InSequence(s1); + EXPECT_CALL(log, Log(WARNING, _, "User not found.")) // #3 + .InSequence(s2); +``` + +as soon as either #2 or #3 is matched, #1 will retire. If a warning `"File too +large."` is logged after this, it will be an error. + +Note that an expectation doesn't retire automatically when it's saturated. For +example, + +```cpp +using ::testing::_; +... + EXPECT_CALL(log, Log(WARNING, _, _)); // #1 + EXPECT_CALL(log, Log(WARNING, _, "File too large.")); // #2 +``` + +says that there will be exactly one warning with the message `"File too +large."`. If the second warning contains this message too, #2 will match again +and result in an upper-bound-violated error. + +If this is not what you want, you can ask an expectation to retire as soon as it +becomes saturated: + +```cpp +using ::testing::_; +... + EXPECT_CALL(log, Log(WARNING, _, _)); // #1 + EXPECT_CALL(log, Log(WARNING, _, "File too large.")) // #2 + .RetiresOnSaturation(); +``` + +Here #2 can be used only once, so if you have two warnings with the message +`"File too large."`, the first will match #2 and the second will match #1 - +there will be no error. + +## Using Actions + +### Returning References from Mock Methods + +If a mock function's return type is a reference, you need to use `ReturnRef()` +instead of `Return()` to return a result: + +```cpp +using ::testing::ReturnRef; + +class MockFoo : public Foo { + public: + MOCK_METHOD(Bar&, GetBar, (), (override)); +}; +... + MockFoo foo; + Bar bar; + EXPECT_CALL(foo, GetBar()) + .WillOnce(ReturnRef(bar)); +... +``` + +### Returning Live Values from Mock Methods + +The `Return(x)` action saves a copy of `x` when the action is created, and +always returns the same value whenever it's executed. Sometimes you may want to +instead return the *live* value of `x` (i.e. its value at the time when the +action is *executed*.). Use either `ReturnRef()` or `ReturnPointee()` for this +purpose. + +If the mock function's return type is a reference, you can do it using +`ReturnRef(x)`, as shown in the previous recipe ("Returning References from Mock +Methods"). However, gMock doesn't let you use `ReturnRef()` in a mock function +whose return type is not a reference, as doing that usually indicates a user +error. So, what shall you do? + +Though you may be tempted, DO NOT use `ByRef()`: + +```cpp +using testing::ByRef; +using testing::Return; + +class MockFoo : public Foo { + public: + MOCK_METHOD(int, GetValue, (), (override)); +}; +... + int x = 0; + MockFoo foo; + EXPECT_CALL(foo, GetValue()) + .WillRepeatedly(Return(ByRef(x))); // Wrong! + x = 42; + EXPECT_EQ(42, foo.GetValue()); +``` + +Unfortunately, it doesn't work here. The above code will fail with error: + +```text +Value of: foo.GetValue() + Actual: 0 +Expected: 42 +``` + +The reason is that `Return(*value*)` converts `value` to the actual return type +of the mock function at the time when the action is *created*, not when it is +*executed*. 
(This behavior was chosen for the action to be safe when `value` is +a proxy object that references some temporary objects.) As a result, `ByRef(x)` +is converted to an `int` value (instead of a `const int&`) when the expectation +is set, and `Return(ByRef(x))` will always return 0. + +`ReturnPointee(pointer)` was provided to solve this problem specifically. It +returns the value pointed to by `pointer` at the time the action is *executed*: + +```cpp +using testing::ReturnPointee; +... + int x = 0; + MockFoo foo; + EXPECT_CALL(foo, GetValue()) + .WillRepeatedly(ReturnPointee(&x)); // Note the & here. + x = 42; + EXPECT_EQ(42, foo.GetValue()); // This will succeed now. +``` + +### Combining Actions + +Want to do more than one thing when a function is called? That's fine. `DoAll()` +allow you to do sequence of actions every time. Only the return value of the +last action in the sequence will be used. + +```cpp +using ::testing::_; +using ::testing::DoAll; + +class MockFoo : public Foo { + public: + MOCK_METHOD(bool, Bar, (int n), (override)); +}; +... + EXPECT_CALL(foo, Bar(_)) + .WillOnce(DoAll(action_1, + action_2, + ... + action_n)); +``` + +### Verifying Complex Arguments {#SaveArgVerify} + +If you want to verify that a method is called with a particular argument but the +match criteria is complex, it can be difficult to distinguish between +cardinality failures (calling the method the wrong number of times) and argument +match failures. Similarly, if you are matching multiple parameters, it may not +be easy to distinguishing which argument failed to match. For example: + +```cpp + // Not ideal: this could fail because of a problem with arg1 or arg2, or maybe + // just the method wasn't called. + EXPECT_CALL(foo, SendValues(_, ElementsAre(1, 4, 4, 7), EqualsProto( ... ))); +``` + +You can instead save the arguments and test them individually: + +```cpp + EXPECT_CALL(foo, SendValues) + .WillOnce(DoAll(SaveArg<1>(&actual_array), SaveArg<2>(&actual_proto))); + ... run the test + EXPECT_THAT(actual_array, ElementsAre(1, 4, 4, 7)); + EXPECT_THAT(actual_proto, EqualsProto( ... )); +``` + +### Mocking Side Effects {#MockingSideEffects} + +Sometimes a method exhibits its effect not via returning a value but via side +effects. For example, it may change some global state or modify an output +argument. To mock side effects, in general you can define your own action by +implementing `::testing::ActionInterface`. + +If all you need to do is to change an output argument, the built-in +`SetArgPointee()` action is convenient: + +```cpp +using ::testing::_; +using ::testing::SetArgPointee; + +class MockMutator : public Mutator { + public: + MOCK_METHOD(void, Mutate, (bool mutate, int* value), (override)); + ... +} +... + MockMutator mutator; + EXPECT_CALL(mutator, Mutate(true, _)) + .WillOnce(SetArgPointee<1>(5)); +``` + +In this example, when `mutator.Mutate()` is called, we will assign 5 to the +`int` variable pointed to by argument #1 (0-based). + +`SetArgPointee()` conveniently makes an internal copy of the value you pass to +it, removing the need to keep the value in scope and alive. The implication +however is that the value must have a copy constructor and assignment operator. + +If the mock method also needs to return a value as well, you can chain +`SetArgPointee()` with `Return()` using `DoAll()`, remembering to put the +`Return()` statement last: + +```cpp +using ::testing::_; +using ::testing::Return; +using ::testing::SetArgPointee; + +class MockMutator : public Mutator { + public: + ... 
+ MOCK_METHOD(bool, MutateInt, (int* value), (override)); +} +... + MockMutator mutator; + EXPECT_CALL(mutator, MutateInt(_)) + .WillOnce(DoAll(SetArgPointee<0>(5), + Return(true))); +``` + +Note, however, that if you use the `ReturnOKWith()` method, it will override the +values provided by `SetArgPointee()` in the response parameters of your function +call. + +If the output argument is an array, use the `SetArrayArgument(first, last)` +action instead. It copies the elements in source range `[first, last)` to the +array pointed to by the `N`-th (0-based) argument: + +```cpp +using ::testing::NotNull; +using ::testing::SetArrayArgument; + +class MockArrayMutator : public ArrayMutator { + public: + MOCK_METHOD(void, Mutate, (int* values, int num_values), (override)); + ... +} +... + MockArrayMutator mutator; + int values[5] = {1, 2, 3, 4, 5}; + EXPECT_CALL(mutator, Mutate(NotNull(), 5)) + .WillOnce(SetArrayArgument<0>(values, values + 5)); +``` + +This also works when the argument is an output iterator: + +```cpp +using ::testing::_; +using ::testing::SetArrayArgument; + +class MockRolodex : public Rolodex { + public: + MOCK_METHOD(void, GetNames, (std::back_insert_iterator>), + (override)); + ... +} +... + MockRolodex rolodex; + vector names; + names.push_back("George"); + names.push_back("John"); + names.push_back("Thomas"); + EXPECT_CALL(rolodex, GetNames(_)) + .WillOnce(SetArrayArgument<0>(names.begin(), names.end())); +``` + +### Changing a Mock Object's Behavior Based on the State + +If you expect a call to change the behavior of a mock object, you can use +`::testing::InSequence` to specify different behaviors before and after the +call: + +```cpp +using ::testing::InSequence; +using ::testing::Return; + +... + { + InSequence seq; + EXPECT_CALL(my_mock, IsDirty()) + .WillRepeatedly(Return(true)); + EXPECT_CALL(my_mock, Flush()); + EXPECT_CALL(my_mock, IsDirty()) + .WillRepeatedly(Return(false)); + } + my_mock.FlushIfDirty(); +``` + +This makes `my_mock.IsDirty()` return `true` before `my_mock.Flush()` is called +and return `false` afterwards. + +If the behavior change is more complex, you can store the effects in a variable +and make a mock method get its return value from that variable: + +```cpp +using ::testing::_; +using ::testing::SaveArg; +using ::testing::Return; + +ACTION_P(ReturnPointee, p) { return *p; } +... + int previous_value = 0; + EXPECT_CALL(my_mock, GetPrevValue) + .WillRepeatedly(ReturnPointee(&previous_value)); + EXPECT_CALL(my_mock, UpdateValue) + .WillRepeatedly(SaveArg<0>(&previous_value)); + my_mock.DoSomethingToUpdateValue(); +``` + +Here `my_mock.GetPrevValue()` will always return the argument of the last +`UpdateValue()` call. + +### Setting the Default Value for a Return Type {#DefaultValue} + +If a mock method's return type is a built-in C++ type or pointer, by default it +will return 0 when invoked. Also, in C++ 11 and above, a mock method whose +return type has a default constructor will return a default-constructed value by +default. You only need to specify an action if this default value doesn't work +for you. + +Sometimes, you may want to change this default value, or you may want to specify +a default value for types gMock doesn't know about. You can do this using the +`::testing::DefaultValue` class template: + +```cpp +using ::testing::DefaultValue; + +class MockFoo : public Foo { + public: + MOCK_METHOD(Bar, CalculateBar, (), (override)); +}; + + +... + Bar default_bar; + // Sets the default return value for type Bar. 
+ DefaultValue::Set(default_bar); + + MockFoo foo; + + // We don't need to specify an action here, as the default + // return value works for us. + EXPECT_CALL(foo, CalculateBar()); + + foo.CalculateBar(); // This should return default_bar. + + // Unsets the default return value. + DefaultValue::Clear(); +``` + +Please note that changing the default value for a type can make you tests hard +to understand. We recommend you to use this feature judiciously. For example, +you may want to make sure the `Set()` and `Clear()` calls are right next to the +code that uses your mock. + +### Setting the Default Actions for a Mock Method + +You've learned how to change the default value of a given type. However, this +may be too coarse for your purpose: perhaps you have two mock methods with the +same return type and you want them to have different behaviors. The `ON_CALL()` +macro allows you to customize your mock's behavior at the method level: + +```cpp +using ::testing::_; +using ::testing::AnyNumber; +using ::testing::Gt; +using ::testing::Return; +... + ON_CALL(foo, Sign(_)) + .WillByDefault(Return(-1)); + ON_CALL(foo, Sign(0)) + .WillByDefault(Return(0)); + ON_CALL(foo, Sign(Gt(0))) + .WillByDefault(Return(1)); + + EXPECT_CALL(foo, Sign(_)) + .Times(AnyNumber()); + + foo.Sign(5); // This should return 1. + foo.Sign(-9); // This should return -1. + foo.Sign(0); // This should return 0. +``` + +As you may have guessed, when there are more than one `ON_CALL()` statements, +the newer ones in the order take precedence over the older ones. In other words, +the **last** one that matches the function arguments will be used. This matching +order allows you to set up the common behavior in a mock object's constructor or +the test fixture's set-up phase and specialize the mock's behavior later. + +Note that both `ON_CALL` and `EXPECT_CALL` have the same "later statements take +precedence" rule, but they don't interact. That is, `EXPECT_CALL`s have their +own precedence order distinct from the `ON_CALL` precedence order. + +### Using Functions/Methods/Functors/Lambdas as Actions {#FunctionsAsActions} + +If the built-in actions don't suit you, you can use an existing callable +(function, `std::function`, method, functor, lambda) as an action. + + + +```cpp +using ::testing::_; using ::testing::Invoke; + +class MockFoo : public Foo { + public: + MOCK_METHOD(int, Sum, (int x, int y), (override)); + MOCK_METHOD(bool, ComplexJob, (int x), (override)); +}; + +int CalculateSum(int x, int y) { return x + y; } +int Sum3(int x, int y, int z) { return x + y + z; } + +class Helper { + public: + bool ComplexJob(int x); +}; + +... + MockFoo foo; + Helper helper; + EXPECT_CALL(foo, Sum(_, _)) + .WillOnce(&CalculateSum) + .WillRepeatedly(Invoke(NewPermanentCallback(Sum3, 1))); + EXPECT_CALL(foo, ComplexJob(_)) + .WillOnce(Invoke(&helper, &Helper::ComplexJob)) + .WillOnce([] { return true; }) + .WillRepeatedly([](int x) { return x > 0; }); + + foo.Sum(5, 6); // Invokes CalculateSum(5, 6). + foo.Sum(2, 3); // Invokes Sum3(1, 2, 3). + foo.ComplexJob(10); // Invokes helper.ComplexJob(10). + foo.ComplexJob(-1); // Invokes the inline lambda. +``` + +The only requirement is that the type of the function, etc must be *compatible* +with the signature of the mock function, meaning that the latter's arguments (if +it takes any) can be implicitly converted to the corresponding arguments of the +former, and the former's return type can be implicitly converted to that of the +latter. 
So, you can invoke something whose type is *not* exactly the same as the +mock function, as long as it's safe to do so - nice, huh? + +**`Note:`{.escaped}** + +* The action takes ownership of the callback and will delete it when the + action itself is destructed. +* If the type of a callback is derived from a base callback type `C`, you need + to implicitly cast it to `C` to resolve the overloading, e.g. + + ```cpp + using ::testing::Invoke; + ... + ResultCallback* is_ok = ...; + ... Invoke(is_ok) ...; // This works. + + BlockingClosure* done = new BlockingClosure; + ... Invoke(implicit_cast(done)) ...; // The cast is necessary. + ``` + +### Using Functions with Extra Info as Actions + +The function or functor you call using `Invoke()` must have the same number of +arguments as the mock function you use it for. Sometimes you may have a function +that takes more arguments, and you are willing to pass in the extra arguments +yourself to fill the gap. You can do this in gMock using callbacks with +pre-bound arguments. Here's an example: + +```cpp +using ::testing::Invoke; + +class MockFoo : public Foo { + public: + MOCK_METHOD(char, DoThis, (int n), (override)); +}; + +char SignOfSum(int x, int y) { + const int sum = x + y; + return (sum > 0) ? '+' : (sum < 0) ? '-' : '0'; +} + +TEST_F(FooTest, Test) { + MockFoo foo; + + EXPECT_CALL(foo, DoThis(2)) + .WillOnce(Invoke(NewPermanentCallback(SignOfSum, 5))); + EXPECT_EQ('+', foo.DoThis(2)); // Invokes SignOfSum(5, 2). +} +``` + +### Invoking a Function/Method/Functor/Lambda/Callback Without Arguments + +`Invoke()` passes the mock function's arguments to the function, etc being +invoked such that the callee has the full context of the call to work with. If +the invoked function is not interested in some or all of the arguments, it can +simply ignore them. + +Yet, a common pattern is that a test author wants to invoke a function without +the arguments of the mock function. She could do that using a wrapper function +that throws away the arguments before invoking an underlining nullary function. +Needless to say, this can be tedious and obscures the intent of the test. + +There are two solutions to this problem. First, you can pass any callable of +zero args as an action. Alternatively, use `InvokeWithoutArgs()`, which is like +`Invoke()` except that it doesn't pass the mock function's arguments to the +callee. Here's an example of each: + +```cpp +using ::testing::_; +using ::testing::InvokeWithoutArgs; + +class MockFoo : public Foo { + public: + MOCK_METHOD(bool, ComplexJob, (int n), (override)); +}; + +bool Job1() { ... } +bool Job2(int n, char c) { ... } + +... + MockFoo foo; + EXPECT_CALL(foo, ComplexJob(_)) + .WillOnce([] { Job1(); }); + .WillOnce(InvokeWithoutArgs(NewPermanentCallback(Job2, 5, 'a'))); + + foo.ComplexJob(10); // Invokes Job1(). + foo.ComplexJob(20); // Invokes Job2(5, 'a'). +``` + +**`Note:`{.escaped}** + +* The action takes ownership of the callback and will delete it when the + action itself is destructed. +* If the type of a callback is derived from a base callback type `C`, you need + to implicitly cast it to `C` to resolve the overloading, e.g. + + ```cpp + using ::testing::InvokeWithoutArgs; + ... + ResultCallback* is_ok = ...; + ... InvokeWithoutArgs(is_ok) ...; // This works. + + BlockingClosure* done = ...; + ... InvokeWithoutArgs(implicit_cast(done)) ...; + // The cast is necessary. 
+  ```
+
+### Invoking an Argument of the Mock Function
+
+Sometimes a mock function will receive a function pointer or a functor (in
+other words, a "callable") as an argument, e.g.
+
+```cpp
+class MockFoo : public Foo {
+ public:
+  MOCK_METHOD(bool, DoThis, (int n, (ResultCallback1<bool, int>* callback)),
+              (override));
+};
+```
+
+and you may want to invoke this callable argument:
+
+```cpp
+using ::testing::_;
+...
+  MockFoo foo;
+  EXPECT_CALL(foo, DoThis(_, _))
+      .WillOnce(...);
+      // Will execute callback->Run(5), where callback is the
+      // second argument DoThis() receives.
+```
+
+NOTE: The section below is legacy documentation from before C++ had lambdas:
+
+Arghh, you need to refer to a mock function argument but C++ has no lambda
+(yet), so you have to define your own action. :-( Or do you really?
+
+Well, gMock has an action to solve *exactly* this problem:
+
+```cpp
+InvokeArgument<N>(arg_1, arg_2, ..., arg_m)
+```
+
+will invoke the `N`-th (0-based) argument the mock function receives, with
+`arg_1`, `arg_2`, ..., and `arg_m`. No matter if the argument is a function
+pointer, a functor, or a callback. gMock handles them all.
+
+With that, you could write:
+
+```cpp
+using ::testing::_;
+using ::testing::InvokeArgument;
+...
+  EXPECT_CALL(foo, DoThis(_, _))
+      .WillOnce(InvokeArgument<1>(5));
+      // Will execute callback->Run(5), where callback is the
+      // second argument DoThis() receives.
+```
+
+What if the callable takes an argument by reference? No problem - just wrap it
+inside `ByRef()`:
+
+```cpp
+  ...
+  MOCK_METHOD(bool, Bar,
+              ((ResultCallback2<bool, int, const Helper&>* callback)),
+              (override));
+  ...
+  using ::testing::_;
+  using ::testing::ByRef;
+  using ::testing::InvokeArgument;
+  ...
+  MockFoo foo;
+  Helper helper;
+  ...
+  EXPECT_CALL(foo, Bar(_))
+      .WillOnce(InvokeArgument<0>(5, ByRef(helper)));
+      // ByRef(helper) guarantees that a reference to helper, not a copy of it,
+      // will be passed to the callback.
+```
+
+What if the callable takes an argument by reference and we do **not** wrap the
+argument in `ByRef()`? Then `InvokeArgument()` will *make a copy* of the
+argument, and pass a *reference to the copy*, instead of a reference to the
+original value, to the callable. This is especially handy when the argument is a
+temporary value:
+
+```cpp
+  ...
+  MOCK_METHOD(bool, DoThat, (bool (*f)(const double& x, const string& s)),
+              (override));
+  ...
+  using ::testing::_;
+  using ::testing::InvokeArgument;
+  ...
+  MockFoo foo;
+  ...
+  EXPECT_CALL(foo, DoThat(_))
+      .WillOnce(InvokeArgument<0>(5.0, string("Hi")));
+      // Will execute (*f)(5.0, string("Hi")), where f is the function pointer
+      // DoThat() receives.  Note that the values 5.0 and string("Hi") are
+      // temporary and dead once the EXPECT_CALL() statement finishes.  Yet
+      // it's fine to perform this action later, since a copy of the values
+      // is kept inside the InvokeArgument action.
+```
+
+### Ignoring an Action's Result
+
+Sometimes you have an action that returns *something*, but you need an action
+that returns `void` (perhaps you want to use it in a mock function that returns
+`void`, or perhaps it needs to be used in `DoAll()` and it's not the last in the
+list). `IgnoreResult()` lets you do that. For example:
+
+```cpp
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::IgnoreResult;
+using ::testing::Return;
+
+int Process(const MyData& data);
+string DoSomething();
+
+class MockFoo : public Foo {
+ public:
+  MOCK_METHOD(void, Abc, (const MyData& data), (override));
+  MOCK_METHOD(bool, Xyz, (), (override));
+};
+
+  ...
+  MockFoo foo;
+  EXPECT_CALL(foo, Abc(_))
+      // .WillOnce(Invoke(Process));
+      // The above line won't compile as Process() returns int but Abc() needs
+      // to return void.
+      .WillOnce(IgnoreResult(Process));
+  EXPECT_CALL(foo, Xyz())
+      .WillOnce(DoAll(IgnoreResult(DoSomething),
+                      // Ignores the string DoSomething() returns.
+                      Return(true)));
+```
+
+Note that you **cannot** use `IgnoreResult()` on an action that already returns
+`void`. Doing so will lead to ugly compiler errors.
+
+### Selecting an Action's Arguments {#SelectingArgs}
+
+Say you have a mock function `Foo()` that takes seven arguments, and you have a
+custom action that you want to invoke when `Foo()` is called. Trouble is, the
+custom action only wants three arguments:
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+...
+  MOCK_METHOD(bool, Foo,
+              (bool visible, const string& name, int x, int y,
+               (const map<pair<int, int>, double>& weight), double min_weight,
+               double max_weight));
+...
+bool IsVisibleInQuadrant1(bool visible, int x, int y) {
+  return visible && x >= 0 && y >= 0;
+}
+...
+  EXPECT_CALL(mock, Foo)
+      .WillOnce(Invoke(IsVisibleInQuadrant1));  // Uh, won't compile. :-(
+```
+
+To please the compiler God, you need to define an "adaptor" that has the same
+signature as `Foo()` and calls the custom action with the right arguments:
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+...
+bool MyIsVisibleInQuadrant1(bool visible, const string& name, int x, int y,
+                            const map<pair<int, int>, double>& weight,
+                            double min_weight, double max_weight) {
+  return IsVisibleInQuadrant1(visible, x, y);
+}
+...
+  EXPECT_CALL(mock, Foo)
+      .WillOnce(Invoke(MyIsVisibleInQuadrant1));  // Now it works.
+```
+
+But isn't this awkward?
+
+gMock provides a generic *action adaptor*, so you can spend your time minding
+more important business than writing your own adaptors. Here's the syntax:
+
+```cpp
+WithArgs<N1, N2, ..., Nk>(action)
+```
+
+creates an action that passes the arguments of the mock function at the given
+indices (0-based) to the inner `action` and performs it. Using `WithArgs`, our
+original example can be written as:
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::WithArgs;
+...
+  EXPECT_CALL(mock, Foo)
+      .WillOnce(WithArgs<0, 2, 3>(Invoke(IsVisibleInQuadrant1)));  // No need to define your own adaptor.
+```
+
+For better readability, gMock also gives you:
+
+* `WithoutArgs(action)` when the inner `action` takes *no* argument, and
+* `WithArg<N>(action)` (no `s` after `Arg`) when the inner `action` takes
+  *one* argument.
+
+As you may have realized, `InvokeWithoutArgs(...)` is just syntactic sugar for
+`WithoutArgs(Invoke(...))`.
+
+Here are more tips:
+
+* The inner action used in `WithArgs` and friends does not have to be
+  `Invoke()` -- it can be anything.
+* You can repeat an argument in the argument list if necessary, e.g.
+  `WithArgs<2, 3, 3, 5>(...)`.
+* You can change the order of the arguments, e.g. `WithArgs<3, 2, 1>(...)`.
+* The types of the selected arguments do *not* have to match the signature of
+  the inner action exactly. It works as long as they can be implicitly
+  converted to the corresponding arguments of the inner action. For example,
+  if the 4-th argument of the mock function is an `int` and `my_action` takes
+  a `double`, `WithArg<4>(my_action)` will work.
+
+### Ignoring Arguments in Action Functions
+
+The [selecting-an-action's-arguments](#SelectingArgs) recipe showed us one way
+to make a mock function and an action with incompatible argument lists fit
+together. The downside is that wrapping the action in `WithArgs<...>()` can get
+tedious for people writing the tests.
+
+If you are defining a function (or method, functor, lambda, callback) to be used
+with `Invoke*()`, and you are not interested in some of its arguments, an
+alternative to `WithArgs` is to declare the uninteresting arguments as `Unused`.
+This makes the definition less cluttered and less fragile in case the types of
+the uninteresting arguments change. It could also increase the chance the action
+function can be reused. For example, given
+
+```cpp
+ public:
+  MOCK_METHOD(double, Foo, (const string& label, double x, double y),
+              (override));
+  MOCK_METHOD(double, Bar, (int index, double x, double y), (override));
+```
+
+instead of
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+
+double DistanceToOriginWithLabel(const string& label, double x, double y) {
+  return sqrt(x*x + y*y);
+}
+double DistanceToOriginWithIndex(int index, double x, double y) {
+  return sqrt(x*x + y*y);
+}
+...
+  EXPECT_CALL(mock, Foo("abc", _, _))
+      .WillOnce(Invoke(DistanceToOriginWithLabel));
+  EXPECT_CALL(mock, Bar(5, _, _))
+      .WillOnce(Invoke(DistanceToOriginWithIndex));
+```
+
+you could write
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::Unused;
+
+double DistanceToOrigin(Unused, double x, double y) {
+  return sqrt(x*x + y*y);
+}
+...
+  EXPECT_CALL(mock, Foo("abc", _, _))
+      .WillOnce(Invoke(DistanceToOrigin));
+  EXPECT_CALL(mock, Bar(5, _, _))
+      .WillOnce(Invoke(DistanceToOrigin));
+```
+
+### Sharing Actions
+
+Just like matchers, a gMock action object consists of a pointer to a ref-counted
+implementation object. Therefore copying actions is also allowed and very
+efficient. When the last action that references the implementation object dies,
+the implementation object will be deleted.
+
+If you have some complex action that you want to use again and again, you may
+not have to build it from scratch every time. If the action doesn't have an
+internal state (i.e. if it always does the same thing no matter how many times
+it has been called), you can assign it to an action variable and use that
+variable repeatedly. For example:
+
+```cpp
+using ::testing::Action;
+using ::testing::DoAll;
+using ::testing::Return;
+using ::testing::SetArgPointee;
+...
+  Action<bool(int*)> set_flag = DoAll(SetArgPointee<0>(5),
+                                      Return(true));
+  ... use set_flag in .WillOnce() and .WillRepeatedly() ...
+```
+
+However, if the action has its own state, you may be surprised if you share the
+action object. Suppose you have an action factory `IncrementCounter(init)` which
+creates an action that increments and returns a counter whose initial value is
+`init`. Using two actions created from the same expression is not the same as
+using one shared action; the two will exhibit different behaviors. Example:
+
+```cpp
+  EXPECT_CALL(foo, DoThis())
+      .WillRepeatedly(IncrementCounter(0));
+  EXPECT_CALL(foo, DoThat())
+      .WillRepeatedly(IncrementCounter(0));
+  foo.DoThis();  // Returns 1.
+  foo.DoThis();  // Returns 2.
+  foo.DoThat();  // Returns 1 - DoThat() uses a different
+                 // counter than DoThis()'s.
+```
+
+versus
+
+```cpp
+using ::testing::Action;
+...
+  Action<int()> increment = IncrementCounter(0);
+  EXPECT_CALL(foo, DoThis())
+      .WillRepeatedly(increment);
+  EXPECT_CALL(foo, DoThat())
+      .WillRepeatedly(increment);
+  foo.DoThis();  // Returns 1.
+  foo.DoThis();  // Returns 2.
+  foo.DoThat();  // Returns 3 - the counter is shared.
+``` + +### Testing Asynchronous Behavior + +One oft-encountered problem with gMock is that it can be hard to test +asynchronous behavior. Suppose you had a `EventQueue` class that you wanted to +test, and you created a separate `EventDispatcher` interface so that you could +easily mock it out. However, the implementation of the class fired all the +events on a background thread, which made test timings difficult. You could just +insert `sleep()` statements and hope for the best, but that makes your test +behavior nondeterministic. A better way is to use gMock actions and +`Notification` objects to force your asynchronous test to behave synchronously. + +```cpp +using ::testing::DoAll; +using ::testing::InvokeWithoutArgs; +using ::testing::Return; + +class MockEventDispatcher : public EventDispatcher { + MOCK_METHOD(bool, DispatchEvent, (int32), (override)); +}; + +ACTION_P(Notify, notification) { + notification->Notify(); +} + +TEST(EventQueueTest, EnqueueEventTest) { + MockEventDispatcher mock_event_dispatcher; + EventQueue event_queue(&mock_event_dispatcher); + + const int32 kEventId = 321; + Notification done; + EXPECT_CALL(mock_event_dispatcher, DispatchEvent(kEventId)) + .WillOnce(Notify(&done)); + + event_queue.EnqueueEvent(kEventId); + done.WaitForNotification(); +} +``` + +In the example above, we set our normal gMock expectations, but then add an +additional action to notify the `Notification` object. Now we can just call +`Notification::WaitForNotification()` in the main thread to wait for the +asynchronous call to finish. After that, our test suite is complete and we can +safely exit. + +Note: this example has a downside: namely, if the expectation is not satisfied, +our test will run forever. It will eventually time-out and fail, but it will +take longer and be slightly harder to debug. To alleviate this problem, you can +use `WaitForNotificationWithTimeout(ms)` instead of `WaitForNotification()`. + +## Misc Recipes on Using gMock + +### Mocking Methods That Use Move-Only Types + +C++11 introduced *move-only types*. A move-only-typed value can be moved from +one object to another, but cannot be copied. `std::unique_ptr` is probably +the most commonly used move-only type. + +Mocking a method that takes and/or returns move-only types presents some +challenges, but nothing insurmountable. This recipe shows you how you can do it. +Note that the support for move-only method arguments was only introduced to +gMock in April 2017; in older code, you may find more complex +[workarounds](#LegacyMoveOnly) for lack of this feature. + +Let’s say we are working on a fictional project that lets one post and share +snippets called “buzzes”. Your code uses these types: + +```cpp +enum class AccessLevel { kInternal, kPublic }; + +class Buzz { + public: + explicit Buzz(AccessLevel access) { ... } + ... +}; + +class Buzzer { + public: + virtual ~Buzzer() {} + virtual std::unique_ptr MakeBuzz(StringPiece text) = 0; + virtual bool ShareBuzz(std::unique_ptr buzz, int64_t timestamp) = 0; + ... +}; +``` + +A `Buzz` object represents a snippet being posted. A class that implements the +`Buzzer` interface is capable of creating and sharing `Buzz`es. Methods in +`Buzzer` may return a `unique_ptr` or take a `unique_ptr`. Now we +need to mock `Buzzer` in our tests. 
+ +To mock a method that accepts or returns move-only types, you just use the +familiar `MOCK_METHOD` syntax as usual: + +```cpp +class MockBuzzer : public Buzzer { + public: + MOCK_METHOD(std::unique_ptr, MakeBuzz, (StringPiece text), (override)); + MOCK_METHOD(bool, ShareBuzz, (std::unique_ptr buzz, int64_t timestamp), + (override)); +}; +``` + +Now that we have the mock class defined, we can use it in tests. In the +following code examples, we assume that we have defined a `MockBuzzer` object +named `mock_buzzer_`: + +```cpp + MockBuzzer mock_buzzer_; +``` + +First let’s see how we can set expectations on the `MakeBuzz()` method, which +returns a `unique_ptr`. + +As usual, if you set an expectation without an action (i.e. the `.WillOnce()` or +`.WillRepeatedly()` clause), when that expectation fires, the default action for +that method will be taken. Since `unique_ptr<>` has a default constructor that +returns a null `unique_ptr`, that’s what you’ll get if you don’t specify an +action: + +```cpp + // Use the default action. + EXPECT_CALL(mock_buzzer_, MakeBuzz("hello")); + + // Triggers the previous EXPECT_CALL. + EXPECT_EQ(nullptr, mock_buzzer_.MakeBuzz("hello")); +``` + +If you are not happy with the default action, you can tweak it as usual; see +[Setting Default Actions](#OnCall). + +If you just need to return a pre-defined move-only value, you can use the +`Return(ByMove(...))` action: + +```cpp + // When this fires, the unique_ptr<> specified by ByMove(...) will + // be returned. + EXPECT_CALL(mock_buzzer_, MakeBuzz("world")) + .WillOnce(Return(ByMove(MakeUnique(AccessLevel::kInternal)))); + + EXPECT_NE(nullptr, mock_buzzer_.MakeBuzz("world")); +``` + +Note that `ByMove()` is essential here - if you drop it, the code won’t compile. + +Quiz time! What do you think will happen if a `Return(ByMove(...))` action is +performed more than once (e.g. you write `... +.WillRepeatedly(Return(ByMove(...)));`)? Come think of it, after the first time +the action runs, the source value will be consumed (since it’s a move-only +value), so the next time around, there’s no value to move from -- you’ll get a +run-time error that `Return(ByMove(...))` can only be run once. + +If you need your mock method to do more than just moving a pre-defined value, +remember that you can always use a lambda or a callable object, which can do +pretty much anything you want: + +```cpp + EXPECT_CALL(mock_buzzer_, MakeBuzz("x")) + .WillRepeatedly([](StringPiece text) { + return MakeUnique(AccessLevel::kInternal); + }); + + EXPECT_NE(nullptr, mock_buzzer_.MakeBuzz("x")); + EXPECT_NE(nullptr, mock_buzzer_.MakeBuzz("x")); +``` + +Every time this `EXPECT_CALL` fires, a new `unique_ptr` will be created +and returned. You cannot do this with `Return(ByMove(...))`. + +That covers returning move-only values; but how do we work with methods +accepting move-only arguments? The answer is that they work normally, although +some actions will not compile when any of method's arguments are move-only. You +can always use `Return`, or a [lambda or functor](#FunctionsAsActions): + +```cpp + using ::testing::Unused; + + EXPECT_CALL(mock_buzzer_, ShareBuzz(NotNull(), _)).WillOnce(Return(true)); + EXPECT_TRUE(mock_buzzer_.ShareBuzz(MakeUnique(AccessLevel::kInternal)), + 0); + + EXPECT_CALL(mock_buzzer_, ShareBuzz(_, _)).WillOnce( + [](std::unique_ptr buzz, Unused) { return buzz != nullptr; }); + EXPECT_FALSE(mock_buzzer_.ShareBuzz(nullptr, 0)); +``` + +Many built-in actions (`WithArgs`, `WithoutArgs`,`DeleteArg`, `SaveArg`, ...) 
+could in principle support move-only arguments, but the support for this is not +implemented yet. If this is blocking you, please file a bug. + +A few actions (e.g. `DoAll`) copy their arguments internally, so they can never +work with non-copyable objects; you'll have to use functors instead. + +#### Legacy workarounds for move-only types {#LegacyMoveOnly} + +Support for move-only function arguments was only introduced to gMock in April +2017. In older code, you may encounter the following workaround for the lack of +this feature (it is no longer necessary - we're including it just for +reference): + +```cpp +class MockBuzzer : public Buzzer { + public: + MOCK_METHOD(bool, DoShareBuzz, (Buzz* buzz, Time timestamp)); + bool ShareBuzz(std::unique_ptr buzz, Time timestamp) override { + return DoShareBuzz(buzz.get(), timestamp); + } +}; +``` + +The trick is to delegate the `ShareBuzz()` method to a mock method (let’s call +it `DoShareBuzz()`) that does not take move-only parameters. Then, instead of +setting expectations on `ShareBuzz()`, you set them on the `DoShareBuzz()` mock +method: + +```cpp + MockBuzzer mock_buzzer_; + EXPECT_CALL(mock_buzzer_, DoShareBuzz(NotNull(), _)); + + // When one calls ShareBuzz() on the MockBuzzer like this, the call is + // forwarded to DoShareBuzz(), which is mocked. Therefore this statement + // will trigger the above EXPECT_CALL. + mock_buzzer_.ShareBuzz(MakeUnique(AccessLevel::kInternal), 0); +``` + +### Making the Compilation Faster + +Believe it or not, the *vast majority* of the time spent on compiling a mock +class is in generating its constructor and destructor, as they perform +non-trivial tasks (e.g. verification of the expectations). What's more, mock +methods with different signatures have different types and thus their +constructors/destructors need to be generated by the compiler separately. As a +result, if you mock many different types of methods, compiling your mock class +can get really slow. + +If you are experiencing slow compilation, you can move the definition of your +mock class' constructor and destructor out of the class body and into a `.cc` +file. This way, even if you `#include` your mock class in N files, the compiler +only needs to generate its constructor and destructor once, resulting in a much +faster compilation. + +Let's illustrate the idea using an example. Here's the definition of a mock +class before applying this recipe: + +```cpp +// File mock_foo.h. +... +class MockFoo : public Foo { + public: + // Since we don't declare the constructor or the destructor, + // the compiler will generate them in every translation unit + // where this mock class is used. + + MOCK_METHOD(int, DoThis, (), (override)); + MOCK_METHOD(bool, DoThat, (const char* str), (override)); + ... more mock methods ... +}; +``` + +After the change, it would look like: + +```cpp +// File mock_foo.h. +... +class MockFoo : public Foo { + public: + // The constructor and destructor are declared, but not defined, here. + MockFoo(); + virtual ~MockFoo(); + + MOCK_METHOD(int, DoThis, (), (override)); + MOCK_METHOD(bool, DoThat, (const char* str), (override)); + ... more mock methods ... +}; +``` + +and + +```cpp +// File mock_foo.cc. +#include "path/to/mock_foo.h" + +// The definitions may appear trivial, but the functions actually do a +// lot of things through the constructors/destructors of the member +// variables used to implement the mock methods. 
+MockFoo::MockFoo() {} +MockFoo::~MockFoo() {} +``` + +### Forcing a Verification + +When it's being destroyed, your friendly mock object will automatically verify +that all expectations on it have been satisfied, and will generate googletest +failures if not. This is convenient as it leaves you with one less thing to +worry about. That is, unless you are not sure if your mock object will be +destroyed. + +How could it be that your mock object won't eventually be destroyed? Well, it +might be created on the heap and owned by the code you are testing. Suppose +there's a bug in that code and it doesn't delete the mock object properly - you +could end up with a passing test when there's actually a bug. + +Using a heap checker is a good idea and can alleviate the concern, but its +implementation is not 100% reliable. So, sometimes you do want to *force* gMock +to verify a mock object before it is (hopefully) destructed. You can do this +with `Mock::VerifyAndClearExpectations(&mock_object)`: + +```cpp +TEST(MyServerTest, ProcessesRequest) { + using ::testing::Mock; + + MockFoo* const foo = new MockFoo; + EXPECT_CALL(*foo, ...)...; + // ... other expectations ... + + // server now owns foo. + MyServer server(foo); + server.ProcessRequest(...); + + // In case that server's destructor will forget to delete foo, + // this will verify the expectations anyway. + Mock::VerifyAndClearExpectations(foo); +} // server is destroyed when it goes out of scope here. +``` + +**Tip:** The `Mock::VerifyAndClearExpectations()` function returns a `bool` to +indicate whether the verification was successful (`true` for yes), so you can +wrap that function call inside a `ASSERT_TRUE()` if there is no point going +further when the verification has failed. + +### Using Check Points {#UsingCheckPoints} + +Sometimes you may want to "reset" a mock object at various check points in your +test: at each check point, you verify that all existing expectations on the mock +object have been satisfied, and then you set some new expectations on it as if +it's newly created. This allows you to work with a mock object in "phases" whose +sizes are each manageable. + +One such scenario is that in your test's `SetUp()` function, you may want to put +the object you are testing into a certain state, with the help from a mock +object. Once in the desired state, you want to clear all expectations on the +mock, such that in the `TEST_F` body you can set fresh expectations on it. + +As you may have figured out, the `Mock::VerifyAndClearExpectations()` function +we saw in the previous recipe can help you here. Or, if you are using +`ON_CALL()` to set default actions on the mock object and want to clear the +default actions as well, use `Mock::VerifyAndClear(&mock_object)` instead. This +function does what `Mock::VerifyAndClearExpectations(&mock_object)` does and +returns the same `bool`, **plus** it clears the `ON_CALL()` statements on +`mock_object` too. + +Another trick you can use to achieve the same effect is to put the expectations +in sequences and insert calls to a dummy "check-point" function at specific +places. Then you can verify that the mock function calls do happen at the right +time. For example, if you are exercising code: + +```cpp + Foo(1); + Foo(2); + Foo(3); +``` + +and want to verify that `Foo(1)` and `Foo(3)` both invoke `mock.Bar("a")`, but +`Foo(2)` doesn't invoke anything. 
You can write: + +```cpp +using ::testing::MockFunction; + +TEST(FooTest, InvokesBarCorrectly) { + MyMock mock; + // Class MockFunction has exactly one mock method. It is named + // Call() and has type F. + MockFunction check; + { + InSequence s; + + EXPECT_CALL(mock, Bar("a")); + EXPECT_CALL(check, Call("1")); + EXPECT_CALL(check, Call("2")); + EXPECT_CALL(mock, Bar("a")); + } + Foo(1); + check.Call("1"); + Foo(2); + check.Call("2"); + Foo(3); +} +``` + +The expectation spec says that the first `Bar("a")` must happen before check +point "1", the second `Bar("a")` must happen after check point "2", and nothing +should happen between the two check points. The explicit check points make it +easy to tell which `Bar("a")` is called by which call to `Foo()`. + +### Mocking Destructors + +Sometimes you want to make sure a mock object is destructed at the right time, +e.g. after `bar->A()` is called but before `bar->B()` is called. We already know +that you can specify constraints on the [order](#OrderedCalls) of mock function +calls, so all we need to do is to mock the destructor of the mock function. + +This sounds simple, except for one problem: a destructor is a special function +with special syntax and special semantics, and the `MOCK_METHOD` macro doesn't +work for it: + +```cpp +MOCK_METHOD(void, ~MockFoo, ()); // Won't compile! +``` + +The good news is that you can use a simple pattern to achieve the same effect. +First, add a mock function `Die()` to your mock class and call it in the +destructor, like this: + +```cpp +class MockFoo : public Foo { + ... + // Add the following two lines to the mock class. + MOCK_METHOD(void, Die, ()); + virtual ~MockFoo() { Die(); } +}; +``` + +(If the name `Die()` clashes with an existing symbol, choose another name.) Now, +we have translated the problem of testing when a `MockFoo` object dies to +testing when its `Die()` method is called: + +```cpp + MockFoo* foo = new MockFoo; + MockBar* bar = new MockBar; + ... + { + InSequence s; + + // Expects *foo to die after bar->A() and before bar->B(). + EXPECT_CALL(*bar, A()); + EXPECT_CALL(*foo, Die()); + EXPECT_CALL(*bar, B()); + } +``` + +And that's that. + +### Using gMock and Threads {#UsingThreads} + +In a **unit** test, it's best if you could isolate and test a piece of code in a +single-threaded context. That avoids race conditions and dead locks, and makes +debugging your test much easier. + +Yet most programs are multi-threaded, and sometimes to test something we need to +pound on it from more than one thread. gMock works for this purpose too. + +Remember the steps for using a mock: + +1. Create a mock object `foo`. +2. Set its default actions and expectations using `ON_CALL()` and + `EXPECT_CALL()`. +3. The code under test calls methods of `foo`. +4. Optionally, verify and reset the mock. +5. Destroy the mock yourself, or let the code under test destroy it. The + destructor will automatically verify it. + +If you follow the following simple rules, your mocks and threads can live +happily together: + +* Execute your *test code* (as opposed to the code being tested) in *one* + thread. This makes your test easy to follow. +* Obviously, you can do step #1 without locking. +* When doing step #2 and #5, make sure no other thread is accessing `foo`. + Obvious too, huh? +* #3 and #4 can be done either in one thread or in multiple threads - anyway + you want. gMock takes care of the locking, so you don't have to do any - + unless required by your test logic. 
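+
+To make the rules concrete, here is a minimal sketch. Everything in it
+(`Logger`, `MockLogger`, the test name) is invented for this illustration and
+is not part of the recipes above; the point is only where each step runs: the
+expectations are set in the test thread before the worker starts, the worker
+thread merely calls the mock, and verification happens after the worker has
+been joined.
+
+```cpp
+#include <string>
+#include <thread>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// Hypothetical interface, used only for this sketch.
+class Logger {
+ public:
+  virtual ~Logger() = default;
+  virtual void Log(const std::string& line) = 0;
+};
+
+class MockLogger : public Logger {
+ public:
+  MOCK_METHOD(void, Log, (const std::string& line), (override));
+};
+
+TEST(WorkerTest, LogsFromBackgroundThread) {
+  using ::testing::AtLeast;
+
+  MockLogger logger;
+
+  // Step #2 happens in the test thread, before any other thread can
+  // touch the mock.
+  EXPECT_CALL(logger, Log("done")).Times(AtLeast(1));
+
+  // Step #3: the code under test may call the mock from another thread;
+  // gMock does its own locking around the call itself.
+  std::thread worker([&logger] { logger.Log("done"); });
+  worker.join();
+
+  // Steps #4/#5: the worker has finished, so the automatic verification in
+  // the mock's destructor runs while no other thread is using the mock.
+}
+```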
+ +If you violate the rules (for example, if you set expectations on a mock while +another thread is calling its methods), you get undefined behavior. That's not +fun, so don't do it. + +gMock guarantees that the action for a mock function is done in the same thread +that called the mock function. For example, in + +```cpp + EXPECT_CALL(mock, Foo(1)) + .WillOnce(action1); + EXPECT_CALL(mock, Foo(2)) + .WillOnce(action2); +``` + +if `Foo(1)` is called in thread 1 and `Foo(2)` is called in thread 2, gMock will +execute `action1` in thread 1 and `action2` in thread 2. + +gMock does *not* impose a sequence on actions performed in different threads +(doing so may create deadlocks as the actions may need to cooperate). This means +that the execution of `action1` and `action2` in the above example *may* +interleave. If this is a problem, you should add proper synchronization logic to +`action1` and `action2` to make the test thread-safe. + +Also, remember that `DefaultValue` is a global resource that potentially +affects *all* living mock objects in your program. Naturally, you won't want to +mess with it from multiple threads or when there still are mocks in action. + +### Controlling How Much Information gMock Prints + +When gMock sees something that has the potential of being an error (e.g. a mock +function with no expectation is called, a.k.a. an uninteresting call, which is +allowed but perhaps you forgot to explicitly ban the call), it prints some +warning messages, including the arguments of the function, the return value, and +the stack trace. Hopefully this will remind you to take a look and see if there +is indeed a problem. + +Sometimes you are confident that your tests are correct and may not appreciate +such friendly messages. Some other times, you are debugging your tests or +learning about the behavior of the code you are testing, and wish you could +observe every mock call that happens (including argument values, the return +value, and the stack trace). Clearly, one size doesn't fit all. + +You can control how much gMock tells you using the `--gmock_verbose=LEVEL` +command-line flag, where `LEVEL` is a string with three possible values: + +* `info`: gMock will print all informational messages, warnings, and errors + (most verbose). At this setting, gMock will also log any calls to the + `ON_CALL/EXPECT_CALL` macros. It will include a stack trace in + "uninteresting call" warnings. +* `warning`: gMock will print both warnings and errors (less verbose); it will + omit the stack traces in "uninteresting call" warnings. This is the default. +* `error`: gMock will print errors only (least verbose). + +Alternatively, you can adjust the value of that flag from within your tests like +so: + +```cpp + ::testing::FLAGS_gmock_verbose = "error"; +``` + +If you find gMock printing too many stack frames with its informational or +warning messages, remember that you can control their amount with the +`--gtest_stack_trace_depth=max_depth` flag. + +Now, judiciously use the right flag to enable gMock serve you better! + +### Gaining Super Vision into Mock Calls + +You have a test using gMock. It fails: gMock tells you some expectations aren't +satisfied. However, you aren't sure why: Is there a typo somewhere in the +matchers? Did you mess up the order of the `EXPECT_CALL`s? Or is the code under +test doing something wrong? How can you find out the cause? + +Won't it be nice if you have X-ray vision and can actually see the trace of all +`EXPECT_CALL`s and mock method calls as they are made? 
For each call, would you +like to see its actual argument values and which `EXPECT_CALL` gMock thinks it +matches? If you still need some help to figure out who made these calls, how +about being able to see the complete stack trace at each mock call? + +You can unlock this power by running your test with the `--gmock_verbose=info` +flag. For example, given the test program: + +```cpp +#include "gmock/gmock.h" + +using testing::_; +using testing::HasSubstr; +using testing::Return; + +class MockFoo { + public: + MOCK_METHOD(void, F, (const string& x, const string& y)); +}; + +TEST(Foo, Bar) { + MockFoo mock; + EXPECT_CALL(mock, F(_, _)).WillRepeatedly(Return()); + EXPECT_CALL(mock, F("a", "b")); + EXPECT_CALL(mock, F("c", HasSubstr("d"))); + + mock.F("a", "good"); + mock.F("a", "b"); +} +``` + +if you run it with `--gmock_verbose=info`, you will see this output: + +```shell +[ RUN ] Foo.Bar + +foo_test.cc:14: EXPECT_CALL(mock, F(_, _)) invoked +Stack trace: ... + +foo_test.cc:15: EXPECT_CALL(mock, F("a", "b")) invoked +Stack trace: ... + +foo_test.cc:16: EXPECT_CALL(mock, F("c", HasSubstr("d"))) invoked +Stack trace: ... + +foo_test.cc:14: Mock function call matches EXPECT_CALL(mock, F(_, _))... + Function call: F(@0x7fff7c8dad40"a",@0x7fff7c8dad10"good") +Stack trace: ... + +foo_test.cc:15: Mock function call matches EXPECT_CALL(mock, F("a", "b"))... + Function call: F(@0x7fff7c8dada0"a",@0x7fff7c8dad70"b") +Stack trace: ... + +foo_test.cc:16: Failure +Actual function call count doesn't match EXPECT_CALL(mock, F("c", HasSubstr("d")))... + Expected: to be called once + Actual: never called - unsatisfied and active +[ FAILED ] Foo.Bar +``` + +Suppose the bug is that the `"c"` in the third `EXPECT_CALL` is a typo and +should actually be `"a"`. With the above message, you should see that the actual +`F("a", "good")` call is matched by the first `EXPECT_CALL`, not the third as +you thought. From that it should be obvious that the third `EXPECT_CALL` is +written wrong. Case solved. + +If you are interested in the mock call trace but not the stack traces, you can +combine `--gmock_verbose=info` with `--gtest_stack_trace_depth=0` on the test +command line. + + + +### Running Tests in Emacs + +If you build and run your tests in Emacs using the `M-x google-compile` command +(as many googletest users do), the source file locations of gMock and googletest +errors will be highlighted. Just press `` on one of them and you'll be +taken to the offending line. Or, you can just type `C-x`` to jump to the next +error. + +To make it even easier, you can add the following lines to your `~/.emacs` file: + +```text +(global-set-key "\M-m" 'google-compile) ; m is for make +(global-set-key [M-down] 'next-error) +(global-set-key [M-up] '(lambda () (interactive) (next-error -1))) +``` + +Then you can type `M-m` to start a build (if you want to run the test as well, +just make sure `foo_test.run` or `runtests` is in the build command you supply +after typing `M-m`), or `M-up`/`M-down` to move back and forth between errors. + +## Extending gMock + +### Writing New Matchers Quickly {#NewMatchers} + +WARNING: gMock does not guarantee when or how many times a matcher will be +invoked. Therefore, all matchers must be functionally pure. See +[this section](#PureMatchers) for more details. + +The `MATCHER*` family of macros can be used to define custom matchers easily. 
+The syntax: + +```cpp +MATCHER(name, description_string_expression) { statements; } +``` + +will define a matcher with the given name that executes the statements, which +must return a `bool` to indicate if the match succeeds. Inside the statements, +you can refer to the value being matched by `arg`, and refer to its type by +`arg_type`. + +The *description string* is a `string`-typed expression that documents what the +matcher does, and is used to generate the failure message when the match fails. +It can (and should) reference the special `bool` variable `negation`, and should +evaluate to the description of the matcher when `negation` is `false`, or that +of the matcher's negation when `negation` is `true`. + +For convenience, we allow the description string to be empty (`""`), in which +case gMock will use the sequence of words in the matcher name as the +description. + +For example: + +```cpp +MATCHER(IsDivisibleBy7, "") { return (arg % 7) == 0; } +``` + +allows you to write + +```cpp + // Expects mock_foo.Bar(n) to be called where n is divisible by 7. + EXPECT_CALL(mock_foo, Bar(IsDivisibleBy7())); +``` + +or, + +```cpp + using ::testing::Not; + ... + // Verifies that two values are divisible by 7. + EXPECT_THAT(some_expression, IsDivisibleBy7()); + EXPECT_THAT(some_other_expression, Not(IsDivisibleBy7())); +``` + +If the above assertions fail, they will print something like: + +```shell + Value of: some_expression + Expected: is divisible by 7 + Actual: 27 + ... + Value of: some_other_expression + Expected: not (is divisible by 7) + Actual: 21 +``` + +where the descriptions `"is divisible by 7"` and `"not (is divisible by 7)"` are +automatically calculated from the matcher name `IsDivisibleBy7`. + +As you may have noticed, the auto-generated descriptions (especially those for +the negation) may not be so great. You can always override them with a `string` +expression of your own: + +```cpp +MATCHER(IsDivisibleBy7, + absl::StrCat(negation ? "isn't" : "is", " divisible by 7")) { + return (arg % 7) == 0; +} +``` + +Optionally, you can stream additional information to a hidden argument named +`result_listener` to explain the match result. For example, a better definition +of `IsDivisibleBy7` is: + +```cpp +MATCHER(IsDivisibleBy7, "") { + if ((arg % 7) == 0) + return true; + + *result_listener << "the remainder is " << (arg % 7); + return false; +} +``` + +With this definition, the above assertion will give a better message: + +```shell + Value of: some_expression + Expected: is divisible by 7 + Actual: 27 (the remainder is 6) +``` + +You should let `MatchAndExplain()` print *any additional information* that can +help a user understand the match result. Note that it should explain why the +match succeeds in case of a success (unless it's obvious) - this is useful when +the matcher is used inside `Not()`. There is no need to print the argument value +itself, as gMock already prints it for you. + +NOTE: The type of the value being matched (`arg_type`) is determined by the +context in which you use the matcher and is supplied to you by the compiler, so +you don't need to worry about declaring it (nor can you). This allows the +matcher to be polymorphic. For example, `IsDivisibleBy7()` can be used to match +any type where the value of `(arg % 7) == 0` can be implicitly converted to a +`bool`. In the `Bar(IsDivisibleBy7())` example above, if method `Bar()` takes an +`int`, `arg_type` will be `int`; if it takes an `unsigned long`, `arg_type` will +be `unsigned long`; and so on. 
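+
+As a small illustration of that polymorphism, the sketch below reuses the
+`IsDivisibleBy7()` matcher from this recipe against a few different argument
+types (the test name and the `Each()` composition are just for this example):
+
+```cpp
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using ::testing::Each;
+
+MATCHER(IsDivisibleBy7, "") { return (arg % 7) == 0; }
+
+TEST(IsDivisibleBy7Test, WorksForSeveralArgumentTypes) {
+  EXPECT_THAT(14, IsDivisibleBy7());     // here arg_type is int
+  EXPECT_THAT(49UL, IsDivisibleBy7());   // here arg_type is unsigned long
+
+  // The matcher also composes with container matchers such as Each(),
+  // where it is applied to every element (arg_type is int again).
+  const std::vector<int> values = {7, 21, 42};
+  EXPECT_THAT(values, Each(IsDivisibleBy7()));
+}
+```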
+ +### Writing New Parameterized Matchers Quickly + +Sometimes you'll want to define a matcher that has parameters. For that you can +use the macro: + +```cpp +MATCHER_P(name, param_name, description_string) { statements; } +``` + +where the description string can be either `""` or a `string` expression that +references `negation` and `param_name`. + +For example: + +```cpp +MATCHER_P(HasAbsoluteValue, value, "") { return abs(arg) == value; } +``` + +will allow you to write: + +```cpp + EXPECT_THAT(Blah("a"), HasAbsoluteValue(n)); +``` + +which may lead to this message (assuming `n` is 10): + +```shell + Value of: Blah("a") + Expected: has absolute value 10 + Actual: -9 +``` + +Note that both the matcher description and its parameter are printed, making the +message human-friendly. + +In the matcher definition body, you can write `foo_type` to reference the type +of a parameter named `foo`. For example, in the body of +`MATCHER_P(HasAbsoluteValue, value)` above, you can write `value_type` to refer +to the type of `value`. + +gMock also provides `MATCHER_P2`, `MATCHER_P3`, ..., up to `MATCHER_P10` to +support multi-parameter matchers: + +```cpp +MATCHER_Pk(name, param_1, ..., param_k, description_string) { statements; } +``` + +Please note that the custom description string is for a particular *instance* of +the matcher, where the parameters have been bound to actual values. Therefore +usually you'll want the parameter values to be part of the description. gMock +lets you do that by referencing the matcher parameters in the description string +expression. + +For example, + +```cpp +using ::testing::PrintToString; +MATCHER_P2(InClosedRange, low, hi, + absl::StrFormat("%s in range [%s, %s]", negation ? "isn't" : "is", + PrintToString(low), PrintToString(hi))) { + return low <= arg && arg <= hi; +} +... +EXPECT_THAT(3, InClosedRange(4, 6)); +``` + +would generate a failure that contains the message: + +```shell + Expected: is in range [4, 6] +``` + +If you specify `""` as the description, the failure message will contain the +sequence of words in the matcher name followed by the parameter values printed +as a tuple. For example, + +```cpp + MATCHER_P2(InClosedRange, low, hi, "") { ... } + ... + EXPECT_THAT(3, InClosedRange(4, 6)); +``` + +would generate a failure that contains the text: + +```shell + Expected: in closed range (4, 6) +``` + +For the purpose of typing, you can view + +```cpp +MATCHER_Pk(Foo, p1, ..., pk, description_string) { ... } +``` + +as shorthand for + +```cpp +template +FooMatcherPk +Foo(p1_type p1, ..., pk_type pk) { ... } +``` + +When you write `Foo(v1, ..., vk)`, the compiler infers the types of the +parameters `v1`, ..., and `vk` for you. If you are not happy with the result of +the type inference, you can specify the types by explicitly instantiating the +template, as in `Foo(5, false)`. As said earlier, you don't get to +(or need to) specify `arg_type` as that's determined by the context in which the +matcher is used. + +You can assign the result of expression `Foo(p1, ..., pk)` to a variable of type +`FooMatcherPk`. This can be useful when composing +matchers. Matchers that don't have a parameter or have only one parameter have +special types: you can assign `Foo()` to a `FooMatcher`-typed variable, and +assign `Foo(p)` to a `FooMatcherP`-typed variable. + +While you can instantiate a matcher template with reference types, passing the +parameters by pointer usually makes your code more readable. 
If, however, you
+still want to pass a parameter by reference, be aware that in the failure
+message generated by the matcher you will see the value of the referenced object
+but not its address.
+
+You can overload matchers with different numbers of parameters:
+
+```cpp
+MATCHER_P(Blah, a, description_string_1) { ... }
+MATCHER_P2(Blah, a, b, description_string_2) { ... }
+```
+
+While it's tempting to always use the `MATCHER*` macros when defining a new
+matcher, you should also consider implementing `MatcherInterface` or using
+`MakePolymorphicMatcher()` instead (see the recipes that follow), especially if
+you need to use the matcher a lot. While these approaches require more work,
+they give you more control on the types of the value being matched and the
+matcher parameters, which in general leads to better compiler error messages
+that pay off in the long run. They also allow overloading matchers based on
+parameter types (as opposed to just based on the number of parameters).
+
+### Writing New Monomorphic Matchers
+
+A matcher of argument type `T` implements `::testing::MatcherInterface<T>` and
+does two things: it tests whether a value of type `T` matches the matcher, and
+can describe what kind of values it matches. The latter ability is used for
+generating readable error messages when expectations are violated.
+
+The interface looks like this:
+
+```cpp
+class MatchResultListener {
+ public:
+  ...
+  // Streams x to the underlying ostream; does nothing if the ostream
+  // is NULL.
+  template <typename T>
+  MatchResultListener& operator<<(const T& x);
+
+  // Returns the underlying ostream.
+  std::ostream* stream();
+};
+
+template <typename T>
+class MatcherInterface {
+ public:
+  virtual ~MatcherInterface();
+
+  // Returns true if and only if the matcher matches x; also explains the match
+  // result to 'listener'.
+  virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0;
+
+  // Describes this matcher to an ostream.
+  virtual void DescribeTo(std::ostream* os) const = 0;
+
+  // Describes the negation of this matcher to an ostream.
+  virtual void DescribeNegationTo(std::ostream* os) const;
+};
+```
+
+If you need a custom matcher but `Truly()` is not a good option (for example,
+you may not be happy with the way `Truly(predicate)` describes itself, or you
+may want your matcher to be polymorphic as `Eq(value)` is), you can define a
+matcher to do whatever you want in two steps: first implement the matcher
+interface, and then define a factory function to create a matcher instance. The
+second step is not strictly needed but it makes the syntax of using the matcher
+nicer.
+
+For example, you can define a matcher to test whether an `int` is divisible by 7
+and then use it like this:
+
+```cpp
+using ::testing::MakeMatcher;
+using ::testing::Matcher;
+using ::testing::MatcherInterface;
+using ::testing::MatchResultListener;
+
+class DivisibleBy7Matcher : public MatcherInterface<int> {
+ public:
+  bool MatchAndExplain(int n,
+                       MatchResultListener* /* listener */) const override {
+    return (n % 7) == 0;
+  }
+
+  void DescribeTo(std::ostream* os) const override {
+    *os << "is divisible by 7";
+  }
+
+  void DescribeNegationTo(std::ostream* os) const override {
+    *os << "is not divisible by 7";
+  }
+};
+
+Matcher<int> DivisibleBy7() {
+  return MakeMatcher(new DivisibleBy7Matcher);
+}
+
+...
+ EXPECT_CALL(foo, Bar(DivisibleBy7())); +``` + +You may improve the matcher message by streaming additional information to the +`listener` argument in `MatchAndExplain()`: + +```cpp +class DivisibleBy7Matcher : public MatcherInterface { + public: + bool MatchAndExplain(int n, + MatchResultListener* listener) const override { + const int remainder = n % 7; + if (remainder != 0) { + *listener << "the remainder is " << remainder; + } + return remainder == 0; + } + ... +}; +``` + +Then, `EXPECT_THAT(x, DivisibleBy7());` may generate a message like this: + +```shell +Value of: x +Expected: is divisible by 7 + Actual: 23 (the remainder is 2) +``` + +### Writing New Polymorphic Matchers + +You've learned how to write your own matchers in the previous recipe. Just one +problem: a matcher created using `MakeMatcher()` only works for one particular +type of arguments. If you want a *polymorphic* matcher that works with arguments +of several types (for instance, `Eq(x)` can be used to match a *`value`* as long +as `value == x` compiles -- *`value`* and `x` don't have to share the same +type), you can learn the trick from `testing/base/public/gmock-matchers.h` but +it's a bit involved. + +Fortunately, most of the time you can define a polymorphic matcher easily with +the help of `MakePolymorphicMatcher()`. Here's how you can define `NotNull()` as +an example: + +```cpp +using ::testing::MakePolymorphicMatcher; +using ::testing::MatchResultListener; +using ::testing::PolymorphicMatcher; + +class NotNullMatcher { + public: + // To implement a polymorphic matcher, first define a COPYABLE class + // that has three members MatchAndExplain(), DescribeTo(), and + // DescribeNegationTo(), like the following. + + // In this example, we want to use NotNull() with any pointer, so + // MatchAndExplain() accepts a pointer of any type as its first argument. + // In general, you can define MatchAndExplain() as an ordinary method or + // a method template, or even overload it. + template + bool MatchAndExplain(T* p, + MatchResultListener* /* listener */) const { + return p != NULL; + } + + // Describes the property of a value matching this matcher. + void DescribeTo(std::ostream* os) const { *os << "is not NULL"; } + + // Describes the property of a value NOT matching this matcher. + void DescribeNegationTo(std::ostream* os) const { *os << "is NULL"; } +}; + +// To construct a polymorphic matcher, pass an instance of the class +// to MakePolymorphicMatcher(). Note the return type. +PolymorphicMatcher NotNull() { + return MakePolymorphicMatcher(NotNullMatcher()); +} + +... + + EXPECT_CALL(foo, Bar(NotNull())); // The argument must be a non-NULL pointer. +``` + +**Note:** Your polymorphic matcher class does **not** need to inherit from +`MatcherInterface` or any other class, and its methods do **not** need to be +virtual. + +Like in a monomorphic matcher, you may explain the match result by streaming +additional information to the `listener` argument in `MatchAndExplain()`. + +### Writing New Cardinalities + +A cardinality is used in `Times()` to tell gMock how many times you expect a +call to occur. It doesn't have to be exact. For example, you can say +`AtLeast(5)` or `Between(2, 4)`. 
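+
+For example, here is a minimal sketch of the built-in cardinalities in use
+(the `Counter`/`MockCounter` names are placeholders invented for this
+illustration):
+
+```cpp
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using ::testing::AtLeast;
+using ::testing::Between;
+
+class Counter {
+ public:
+  virtual ~Counter() = default;
+  virtual void Increment() = 0;
+  virtual void Reset() = 0;
+};
+
+class MockCounter : public Counter {
+ public:
+  MOCK_METHOD(void, Increment, (), (override));
+  MOCK_METHOD(void, Reset, (), (override));
+};
+
+TEST(CardinalityDemo, BuiltInCardinalities) {
+  MockCounter counter;
+  EXPECT_CALL(counter, Increment()).Times(AtLeast(5));  // five or more calls
+  EXPECT_CALL(counter, Reset()).Times(Between(2, 4));   // two to four calls
+
+  for (int i = 0; i < 6; ++i) counter.Increment();
+  counter.Reset();
+  counter.Reset();
+}
+```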
+ +If the [built-in set](cheat_sheet.md#CardinalityList) of cardinalities doesn't +suit you, you are free to define your own by implementing the following +interface (in namespace `testing`): + +```cpp +class CardinalityInterface { + public: + virtual ~CardinalityInterface(); + + // Returns true if and only if call_count calls will satisfy this cardinality. + virtual bool IsSatisfiedByCallCount(int call_count) const = 0; + + // Returns true if and only if call_count calls will saturate this + // cardinality. + virtual bool IsSaturatedByCallCount(int call_count) const = 0; + + // Describes self to an ostream. + virtual void DescribeTo(std::ostream* os) const = 0; +}; +``` + +For example, to specify that a call must occur even number of times, you can +write + +```cpp +using ::testing::Cardinality; +using ::testing::CardinalityInterface; +using ::testing::MakeCardinality; + +class EvenNumberCardinality : public CardinalityInterface { + public: + bool IsSatisfiedByCallCount(int call_count) const override { + return (call_count % 2) == 0; + } + + bool IsSaturatedByCallCount(int call_count) const override { + return false; + } + + void DescribeTo(std::ostream* os) const { + *os << "called even number of times"; + } +}; + +Cardinality EvenNumber() { + return MakeCardinality(new EvenNumberCardinality); +} + +... + EXPECT_CALL(foo, Bar(3)) + .Times(EvenNumber()); +``` + +### Writing New Actions Quickly {#QuickNewActions} + +If the built-in actions don't work for you, you can easily define your own one. +Just define a functor class with a (possibly templated) call operator, matching +the signature of your action. + +```cpp +struct Increment { + template + T operator()(T* arg) { + return ++(*arg); + } +} +``` + +The same approach works with stateful functors (or any callable, really): + +``` +struct MultiplyBy { + template + T operator()(T arg) { return arg * multiplier; } + + int multiplier; +} + +// Then use: +// EXPECT_CALL(...).WillOnce(MultiplyBy{7}); +``` + +#### Legacy macro-based Actions + +Before C++11, the functor-based actions were not supported; the old way of +writing actions was through a set of `ACTION*` macros. We suggest to avoid them +in new code; they hide a lot of logic behind the macro, potentially leading to +harder-to-understand compiler errors. Nevertheless, we cover them here for +completeness. + +By writing + +```cpp +ACTION(name) { statements; } +``` + +in a namespace scope (i.e. not inside a class or function), you will define an +action with the given name that executes the statements. The value returned by +`statements` will be used as the return value of the action. Inside the +statements, you can refer to the K-th (0-based) argument of the mock function as +`argK`. For example: + +```cpp +ACTION(IncrementArg1) { return ++(*arg1); } +``` + +allows you to write + +```cpp +... WillOnce(IncrementArg1()); +``` + +Note that you don't need to specify the types of the mock function arguments. +Rest assured that your code is type-safe though: you'll get a compiler error if +`*arg1` doesn't support the `++` operator, or if the type of `++(*arg1)` isn't +compatible with the mock function's return type. + +Another example: + +```cpp +ACTION(Foo) { + (*arg2)(5); + Blah(); + *arg1 = 0; + return arg0; +} +``` + +defines an action `Foo()` that invokes argument #2 (a function pointer) with 5, +calls function `Blah()`, sets the value pointed to by argument #1 to 0, and +returns argument #0. 
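+
+To make the argument indices concrete, here is a sketch of a mock method whose
+signature lines up with that action. The `MockProcessor`/`Start` names, the
+`Blah()` stub, and the test itself are invented for this illustration; only
+the `Foo` action comes from the text above (it is repeated so the sketch is
+self-contained):
+
+```cpp
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+void Blah() {}  // stands in for the Blah() called by the action
+
+ACTION(Foo) {
+  (*arg2)(5);
+  Blah();
+  *arg1 = 0;
+  return arg0;
+}
+
+class MockProcessor {
+ public:
+  // arg0 is an int, arg1 an int*, arg2 a function pointer - exactly the
+  // shape the Foo action expects.
+  MOCK_METHOD(int, Start, (int id, int* status, void (*callback)(int)));
+};
+
+TEST(ActionFooDemo, ArgumentIndices) {
+  MockProcessor processor;
+  int status = -1;
+
+  EXPECT_CALL(processor, Start).WillOnce(Foo());
+
+  // Foo() invokes callback(5), calls Blah(), zeroes *status, and returns id.
+  const int result = processor.Start(7, &status, [](int) {});
+  EXPECT_EQ(0, status);
+  EXPECT_EQ(7, result);
+}
+```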
+ +For more convenience and flexibility, you can also use the following pre-defined +symbols in the body of `ACTION`: + +`argK_type` | The type of the K-th (0-based) argument of the mock function +:-------------- | :----------------------------------------------------------- +`args` | All arguments of the mock function as a tuple +`args_type` | The type of all arguments of the mock function as a tuple +`return_type` | The return type of the mock function +`function_type` | The type of the mock function + +For example, when using an `ACTION` as a stub action for mock function: + +```cpp +int DoSomething(bool flag, int* ptr); +``` + +we have: + +Pre-defined Symbol | Is Bound To +------------------ | --------------------------------- +`arg0` | the value of `flag` +`arg0_type` | the type `bool` +`arg1` | the value of `ptr` +`arg1_type` | the type `int*` +`args` | the tuple `(flag, ptr)` +`args_type` | the type `std::tuple` +`return_type` | the type `int` +`function_type` | the type `int(bool, int*)` + +#### Legacy macro-based parameterized Actions + +Sometimes you'll want to parameterize an action you define. For that we have +another macro + +```cpp +ACTION_P(name, param) { statements; } +``` + +For example, + +```cpp +ACTION_P(Add, n) { return arg0 + n; } +``` + +will allow you to write + +```cpp +// Returns argument #0 + 5. +... WillOnce(Add(5)); +``` + +For convenience, we use the term *arguments* for the values used to invoke the +mock function, and the term *parameters* for the values used to instantiate an +action. + +Note that you don't need to provide the type of the parameter either. Suppose +the parameter is named `param`, you can also use the gMock-defined symbol +`param_type` to refer to the type of the parameter as inferred by the compiler. +For example, in the body of `ACTION_P(Add, n)` above, you can write `n_type` for +the type of `n`. + +gMock also provides `ACTION_P2`, `ACTION_P3`, and etc to support multi-parameter +actions. For example, + +```cpp +ACTION_P2(ReturnDistanceTo, x, y) { + double dx = arg0 - x; + double dy = arg1 - y; + return sqrt(dx*dx + dy*dy); +} +``` + +lets you write + +```cpp +... WillOnce(ReturnDistanceTo(5.0, 26.5)); +``` + +You can view `ACTION` as a degenerated parameterized action where the number of +parameters is 0. + +You can also easily define actions overloaded on the number of parameters: + +```cpp +ACTION_P(Plus, a) { ... } +ACTION_P2(Plus, a, b) { ... } +``` + +### Restricting the Type of an Argument or Parameter in an ACTION + +For maximum brevity and reusability, the `ACTION*` macros don't ask you to +provide the types of the mock function arguments and the action parameters. +Instead, we let the compiler infer the types for us. + +Sometimes, however, we may want to be more explicit about the types. There are +several tricks to do that. For example: + +```cpp +ACTION(Foo) { + // Makes sure arg0 can be converted to int. + int n = arg0; + ... use n instead of arg0 here ... +} + +ACTION_P(Bar, param) { + // Makes sure the type of arg1 is const char*. + ::testing::StaticAssertTypeEq(); + + // Makes sure param can be converted to bool. + bool flag = param; +} +``` + +where `StaticAssertTypeEq` is a compile-time assertion in googletest that +verifies two types are the same. + +### Writing New Action Templates Quickly + +Sometimes you want to give an action explicit template parameters that cannot be +inferred from its value parameters. `ACTION_TEMPLATE()` supports that and can be +viewed as an extension to `ACTION()` and `ACTION_P*()`. 
+
+The syntax:
+
+```cpp
+ACTION_TEMPLATE(ActionName,
+                HAS_m_TEMPLATE_PARAMS(kind1, name1, ..., kind_m, name_m),
+                AND_n_VALUE_PARAMS(p1, ..., p_n)) { statements; }
+```
+
+defines an action template that takes *m* explicit template parameters and *n*
+value parameters, where *m* is in [1, 10] and *n* is in [0, 10]. `name_i` is the
+name of the *i*-th template parameter, and `kind_i` specifies whether it's a
+`typename`, an integral constant, or a template. `p_i` is the name of the *i*-th
+value parameter.
+
+Example:
+
+```cpp
+// DuplicateArg<k, T>(output) converts the k-th argument of the mock
+// function to type T and copies it to *output.
+ACTION_TEMPLATE(DuplicateArg,
+                // Note the comma between int and k:
+                HAS_2_TEMPLATE_PARAMS(int, k, typename, T),
+                AND_1_VALUE_PARAMS(output)) {
+  *output = T(std::get<k>(args));
+}
+```
+
+To create an instance of an action template, write:
+
+```cpp
+ActionName<t1, ..., t_m>(v1, ..., v_n)
+```
+
+where the `t`s are the template arguments and the `v`s are the value arguments.
+The value argument types are inferred by the compiler. For example:
+
+```cpp
+using ::testing::_;
+...
+  int n;
+  EXPECT_CALL(mock, Foo).WillOnce(DuplicateArg<1, unsigned char>(&n));
+```
+
+If you want to explicitly specify the value argument types, you can provide
+additional template arguments:
+
+```cpp
+ActionName<t1, ..., t_m, u1, ..., u_n>(v1, ..., v_n)
+```
+
+where `u_i` is the desired type of `v_i`.
+
+`ACTION_TEMPLATE` and `ACTION`/`ACTION_P*` can be overloaded on the number of
+value parameters, but not on the number of template parameters. Without the
+restriction, the meaning of the following is unclear:
+
+```cpp
+  OverloadedAction<bool>(x);
+```
+
+Are we using a single-template-parameter action where `bool` refers to the type
+of `x`, or a two-template-parameter action where the compiler is asked to infer
+the type of `x`?
+
+### Using the ACTION Object's Type
+
+If you are writing a function that returns an `ACTION` object, you'll need to
+know its type. The type depends on the macro used to define the action and the
+parameter types. The rule is relatively simple:
+
+| Given Definition                                                                | Expression                                 | Has Type                               |
+| ------------------------------------------------------------------------------- | ------------------------------------------ | -------------------------------------- |
+| `ACTION(Foo)`                                                                   | `Foo()`                                    | `FooAction`                            |
+| `ACTION_TEMPLATE(Foo, HAS_m_TEMPLATE_PARAMS(...), AND_0_VALUE_PARAMS())`        | `Foo<t1, ..., t_m>()`                      | `FooAction<t1, ..., t_m>`              |
+| `ACTION_P(Bar, param)`                                                          | `Bar(int_value)`                           | `BarActionP<int>`                      |
+| `ACTION_TEMPLATE(Bar, HAS_m_TEMPLATE_PARAMS(...), AND_1_VALUE_PARAMS(p1))`      | `Bar<t1, ..., t_m>(int_value)`             | `BarActionP<t1, ..., t_m, int>`        |
+| `ACTION_P2(Baz, p1, p2)`                                                        | `Baz(bool_value, int_value)`               | `BazActionP2<bool, int>`               |
+| `ACTION_TEMPLATE(Baz, HAS_m_TEMPLATE_PARAMS(...), AND_2_VALUE_PARAMS(p1, p2))`  | `Baz<t1, ..., t_m>(bool_value, int_value)` | `BazActionP2<t1, ..., t_m, bool, int>` |
+| ...                                                                             | ...                                        | ...                                    |
+
+Note that we have to pick different suffixes (`Action`, `ActionP`, `ActionP2`,
+etc.) for actions with different numbers of value parameters, or the action
+definitions cannot be overloaded on the number of them.
+
+### Writing New Monomorphic Actions {#NewMonoActions}
+
+While the `ACTION*` macros are very convenient, sometimes they are
+inappropriate. For example, despite the tricks shown in the previous recipes,
+they don't let you directly specify the types of the mock function arguments and
+the action parameters, which in general leads to unoptimized compiler error
+messages that can baffle unfamiliar users. They also don't allow overloading
+actions based on parameter types without jumping through some hoops.
+
+An alternative to the `ACTION*` macros is to implement
+`::testing::ActionInterface<F>`, where `F` is the type of the mock function in
+which the action will be used. For example:
+
+```cpp
+template <typename F>
+class ActionInterface {
+ public:
+  virtual ~ActionInterface();
+
+  // Performs the action. Result is the return type of function type
+  // F, and ArgumentTuple is the tuple of arguments of F.
+  //
+  // For example, if F is int(bool, const string&), then Result would
+  // be int, and ArgumentTuple would be std::tuple<bool, const string&>.
+  virtual Result Perform(const ArgumentTuple& args) = 0;
+};
+```
+
+```cpp
+using ::testing::_;
+using ::testing::Action;
+using ::testing::ActionInterface;
+using ::testing::MakeAction;
+
+typedef int IncrementMethod(int*);
+
+class IncrementArgumentAction : public ActionInterface<IncrementMethod> {
+ public:
+  int Perform(const std::tuple<int*>& args) override {
+    int* p = std::get<0>(args);  // Grabs the first argument.
+    return (*p)++;
+  }
+};
+
+Action<IncrementMethod> IncrementArgument() {
+  return MakeAction(new IncrementArgumentAction);
+}
+
+...
+  EXPECT_CALL(foo, Baz(_))
+      .WillOnce(IncrementArgument());
+
+  int n = 5;
+  foo.Baz(&n);  // Should return 5 and change n to 6.
+```
+
+### Writing New Polymorphic Actions {#NewPolyActions}
+
+The previous recipe showed you how to define your own action. This is all good,
+except that you need to know the type of the function in which the action will
+be used. Sometimes that can be a problem. For example, you may want to use the
+action in functions with *different* types (e.g. like `Return()` and
+`SetArgPointee()`).
+
+If an action can be used in several types of mock functions, we say it's
+*polymorphic*. The `MakePolymorphicAction()` function template makes it easy to
+define such an action:
+
+```cpp
+namespace testing {
+template <typename Impl>
+PolymorphicAction<Impl> MakePolymorphicAction(const Impl& impl);
+}  // namespace testing
+```
+
+As an example, let's define an action that returns the second argument in the
+mock function's argument list. The first step is to define an implementation
+class:
+
+```cpp
+class ReturnSecondArgumentAction {
+ public:
+  template <typename Result, typename ArgumentTuple>
+  Result Perform(const ArgumentTuple& args) const {
+    // To get the i-th (0-based) argument, use std::get<i>(args).
+    return std::get<1>(args);
+  }
+};
+```
+
+This implementation class does *not* need to inherit from any particular class.
+What matters is that it must have a `Perform()` method template. This method
+template takes the mock function's arguments as a tuple in a **single**
+argument, and returns the result of the action. It can be either `const` or not,
+but must be invokable with exactly one template argument, which is the result
+type. In other words, you must be able to call `Perform<R>(args)` where `R` is
+the mock function's return type and `args` is its arguments in a tuple.
+
+Next, we use `MakePolymorphicAction()` to turn an instance of the implementation
+class into the polymorphic action we need. It will be convenient to have a
+wrapper for this:
+
+```cpp
+using ::testing::MakePolymorphicAction;
+using ::testing::PolymorphicAction;
+
+PolymorphicAction<ReturnSecondArgumentAction> ReturnSecondArgument() {
+  return MakePolymorphicAction(ReturnSecondArgumentAction());
+}
+```
+
+Now, you can use this polymorphic action the same way you use the built-in ones:
+
+```cpp
+using ::testing::_;
+
+class MockFoo : public Foo {
+ public:
+  MOCK_METHOD(int, DoThis, (bool flag, int n), (override));
+  MOCK_METHOD(string, DoThat, (int x, const char* str1, const char* str2),
+              (override));
+};
+
+  ...
+ MockFoo foo; + EXPECT_CALL(foo, DoThis).WillOnce(ReturnSecondArgument()); + EXPECT_CALL(foo, DoThat).WillOnce(ReturnSecondArgument()); + ... + foo.DoThis(true, 5); // Will return 5. + foo.DoThat(1, "Hi", "Bye"); // Will return "Hi". +``` + +### Teaching gMock How to Print Your Values + +When an uninteresting or unexpected call occurs, gMock prints the argument +values and the stack trace to help you debug. Assertion macros like +`EXPECT_THAT` and `EXPECT_EQ` also print the values in question when the +assertion fails. gMock and googletest do this using googletest's user-extensible +value printer. + +This printer knows how to print built-in C++ types, native arrays, STL +containers, and any type that supports the `<<` operator. For other types, it +prints the raw bytes in the value and hopes that you the user can figure it out. +[googletest's advanced guide](../../googletest/docs/advanced.md#teaching-googletest-how-to-print-your-values) +explains how to extend the printer to do a better job at printing your +particular type than to dump the bytes. + +## Useful Mocks Created Using gMock + + + + +### Mock std::function {#MockFunction} + +`std::function` is a general function type introduced in C++11. It is a +preferred way of passing callbacks to new interfaces. Functions are copiable, +and are not usually passed around by pointer, which makes them tricky to mock. +But fear not - `MockFunction` can help you with that. + +`MockFunction` has a mock method `Call()` with the signature: + +```cpp + R Call(T1, ..., Tn); +``` + +It also has a `AsStdFunction()` method, which creates a `std::function` proxy +forwarding to Call: + +```cpp + std::function AsStdFunction(); +``` + +To use `MockFunction`, first create `MockFunction` object and set up +expectations on its `Call` method. Then pass proxy obtained from +`AsStdFunction()` to the code you are testing. For example: + +```cpp +TEST(FooTest, RunsCallbackWithBarArgument) { + // 1. Create a mock object. + MockFunction mock_function; + + // 2. Set expectations on Call() method. + EXPECT_CALL(mock_function, Call("bar")).WillOnce(Return(1)); + + // 3. Exercise code that uses std::function. + Foo(mock_function.AsStdFunction()); + // Foo's signature can be either of: + // void Foo(const std::function& fun); + // void Foo(std::function fun); + + // 4. All expectations will be verified when mock_function + // goes out of scope and is destroyed. +} +``` + +Remember that function objects created with `AsStdFunction()` are just +forwarders. If you create multiple of them, they will share the same set of +expectations. + +Although `std::function` supports unlimited number of arguments, `MockFunction` +implementation is limited to ten. If you ever hit that limit... well, your +callback has bigger problems than being mockable. :-) + + diff --git a/GraphBLAS/CUDA/test/googlemock/docs/for_dummies.md b/GraphBLAS/CUDA/test/googlemock/docs/for_dummies.md new file mode 100644 index 0000000000..327e6cc327 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/for_dummies.md @@ -0,0 +1,700 @@ +## gMock for Dummies {#GMockForDummies} + + + +### What Is gMock? + +When you write a prototype or test, often it's not feasible or wise to rely on +real objects entirely. A **mock object** implements the same interface as a real +object (so it can be used as one), but lets you specify at run time how it will +be used and what it should do (which methods will be called? in which order? how +many times? with what arguments? what will they return? etc). 
+ +**Note:** It is easy to confuse the term *fake objects* with mock objects. Fakes +and mocks actually mean very different things in the Test-Driven Development +(TDD) community: + +* **Fake** objects have working implementations, but usually take some + shortcut (perhaps to make the operations less expensive), which makes them + not suitable for production. An in-memory file system would be an example of + a fake. +* **Mocks** are objects pre-programmed with *expectations*, which form a + specification of the calls they are expected to receive. + +If all this seems too abstract for you, don't worry - the most important thing +to remember is that a mock allows you to check the *interaction* between itself +and code that uses it. The difference between fakes and mocks shall become much +clearer once you start to use mocks. + +**gMock** is a library (sometimes we also call it a "framework" to make it sound +cool) for creating mock classes and using them. It does to C++ what +jMock/EasyMock does to Java (well, more or less). + +When using gMock, + +1. first, you use some simple macros to describe the interface you want to + mock, and they will expand to the implementation of your mock class; +2. next, you create some mock objects and specify its expectations and behavior + using an intuitive syntax; +3. then you exercise code that uses the mock objects. gMock will catch any + violation to the expectations as soon as it arises. + +### Why gMock? + +While mock objects help you remove unnecessary dependencies in tests and make +them fast and reliable, using mocks manually in C++ is *hard*: + +* Someone has to implement the mocks. The job is usually tedious and + error-prone. No wonder people go great distance to avoid it. +* The quality of those manually written mocks is a bit, uh, unpredictable. You + may see some really polished ones, but you may also see some that were + hacked up in a hurry and have all sorts of ad hoc restrictions. +* The knowledge you gained from using one mock doesn't transfer to the next + one. + +In contrast, Java and Python programmers have some fine mock frameworks (jMock, +EasyMock, [Mox](http://wtf/mox), etc), which automate the creation of mocks. As +a result, mocking is a proven effective technique and widely adopted practice in +those communities. Having the right tool absolutely makes the difference. + +gMock was built to help C++ programmers. It was inspired by jMock and EasyMock, +but designed with C++'s specifics in mind. It is your friend if any of the +following problems is bothering you: + +* You are stuck with a sub-optimal design and wish you had done more + prototyping before it was too late, but prototyping in C++ is by no means + "rapid". +* Your tests are slow as they depend on too many libraries or use expensive + resources (e.g. a database). +* Your tests are brittle as some resources they use are unreliable (e.g. the + network). +* You want to test how your code handles a failure (e.g. a file checksum + error), but it's not easy to cause one. +* You need to make sure that your module interacts with other modules in the + right way, but it's hard to observe the interaction; therefore you resort to + observing the side effects at the end of the action, but it's awkward at + best. +* You want to "mock out" your dependencies, except that they don't have mock + implementations yet; and, frankly, you aren't thrilled by some of those + hand-written mocks. 
+ +We encourage you to use gMock as + +* a *design* tool, for it lets you experiment with your interface design early + and often. More iterations lead to better designs! +* a *testing* tool to cut your tests' outbound dependencies and probe the + interaction between your module and its collaborators. + +### Getting Started + +gMock is bundled with googletest. + +### A Case for Mock Turtles + +Let's look at an example. Suppose you are developing a graphics program that +relies on a [LOGO](http://en.wikipedia.org/wiki/Logo_programming_language)-like +API for drawing. How would you test that it does the right thing? Well, you can +run it and compare the screen with a golden screen snapshot, but let's admit it: +tests like this are expensive to run and fragile (What if you just upgraded to a +shiny new graphics card that has better anti-aliasing? Suddenly you have to +update all your golden images.). It would be too painful if all your tests are +like this. Fortunately, you learned about +[Dependency Injection](http://en.wikipedia.org/wiki/Dependency_injection) and know the right thing +to do: instead of having your application talk to the system API directly, wrap +the API in an interface (say, `Turtle`) and code to that interface: + +```cpp +class Turtle { + ... + virtual ~Turtle() {}; + virtual void PenUp() = 0; + virtual void PenDown() = 0; + virtual void Forward(int distance) = 0; + virtual void Turn(int degrees) = 0; + virtual void GoTo(int x, int y) = 0; + virtual int GetX() const = 0; + virtual int GetY() const = 0; +}; +``` + +(Note that the destructor of `Turtle` **must** be virtual, as is the case for +**all** classes you intend to inherit from - otherwise the destructor of the +derived class will not be called when you delete an object through a base +pointer, and you'll get corrupted program states like memory leaks.) + +You can control whether the turtle's movement will leave a trace using `PenUp()` +and `PenDown()`, and control its movement using `Forward()`, `Turn()`, and +`GoTo()`. Finally, `GetX()` and `GetY()` tell you the current position of the +turtle. + +Your program will normally use a real implementation of this interface. In +tests, you can use a mock implementation instead. This allows you to easily +check what drawing primitives your program is calling, with what arguments, and +in which order. Tests written this way are much more robust (they won't break +because your new machine does anti-aliasing differently), easier to read and +maintain (the intent of a test is expressed in the code, not in some binary +images), and run *much, much faster*. + +### Writing the Mock Class + +If you are lucky, the mocks you need to use have already been implemented by +some nice people. If, however, you find yourself in the position to write a mock +class, relax - gMock turns this task into a fun game! (Well, almost.) + +#### How to Define It + +Using the `Turtle` interface as example, here are the simple steps you need to +follow: + +* Derive a class `MockTurtle` from `Turtle`. +* Take a *virtual* function of `Turtle` (while it's possible to + [mock non-virtual methods using templates](cook_book.md#MockingNonVirtualMethods), + it's much more involved). +* In the `public:` section of the child class, write `MOCK_METHOD();` +* Now comes the fun part: you take the function signature, cut-and-paste it + into the macro, and add two commas - one between the return type and the + name, another between the name and the argument list. 
+* If you're mocking a const method, add a 4th parameter containing `(const)` + (the parentheses are required). +* Since you're overriding a virtual method, we suggest adding the `override` + keyword. For const methods the 4th parameter becomes `(const, override)`, + for non-const methods just `(override)`. This isn't mandatory. +* Repeat until all virtual functions you want to mock are done. (It goes + without saying that *all* pure virtual methods in your abstract class must + be either mocked or overridden.) + +After the process, you should have something like: + +```cpp +#include "gmock/gmock.h" // Brings in gMock. + +class MockTurtle : public Turtle { + public: + ... + MOCK_METHOD(void, PenUp, (), (override)); + MOCK_METHOD(void, PenDown, (), (override)); + MOCK_METHOD(void, Forward, (int distance), (override)); + MOCK_METHOD(void, Turn, (int degrees), (override)); + MOCK_METHOD(void, GoTo, (int x, int y), (override)); + MOCK_METHOD(int, GetX, (), (const, override)); + MOCK_METHOD(int, GetY, (), (const, override)); +}; +``` + +You don't need to define these mock methods somewhere else - the `MOCK_METHOD` +macro will generate the definitions for you. It's that simple! + +#### Where to Put It + +When you define a mock class, you need to decide where to put its definition. +Some people put it in a `_test.cc`. This is fine when the interface being mocked +(say, `Foo`) is owned by the same person or team. Otherwise, when the owner of +`Foo` changes it, your test could break. (You can't really expect `Foo`'s +maintainer to fix every test that uses `Foo`, can you?) + +So, the rule of thumb is: if you need to mock `Foo` and it's owned by others, +define the mock class in `Foo`'s package (better, in a `testing` sub-package +such that you can clearly separate production code and testing utilities), put +it in a `.h` and a `cc_library`. Then everyone can reference them from their +tests. If `Foo` ever changes, there is only one copy of `MockFoo` to change, and +only tests that depend on the changed methods need to be fixed. + +Another way to do it: you can introduce a thin layer `FooAdaptor` on top of +`Foo` and code to this new interface. Since you own `FooAdaptor`, you can absorb +changes in `Foo` much more easily. While this is more work initially, carefully +choosing the adaptor interface can make your code easier to write and more +readable (a net win in the long run), as you can choose `FooAdaptor` to fit your +specific domain much better than `Foo` does. + + + +### Using Mocks in Tests + +Once you have a mock class, using it is easy. The typical work flow is: + +1. Import the gMock names from the `testing` namespace such that you can use + them unqualified (You only have to do it once per file). Remember that + namespaces are a good idea. +2. Create some mock objects. +3. Specify your expectations on them (How many times will a method be called? + With what arguments? What should it do? etc.). +4. Exercise some code that uses the mocks; optionally, check the result using + googletest assertions. If a mock method is called more than expected or with + wrong arguments, you'll get an error immediately. +5. When a mock is destructed, gMock will automatically check whether all + expectations on it have been satisfied. 
+ +Here's an example: + +```cpp +#include "path/to/mock-turtle.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using ::testing::AtLeast; // #1 + +TEST(PainterTest, CanDrawSomething) { + MockTurtle turtle; // #2 + EXPECT_CALL(turtle, PenDown()) // #3 + .Times(AtLeast(1)); + + Painter painter(&turtle); // #4 + + EXPECT_TRUE(painter.DrawCircle(0, 0, 10)); // #5 +} +``` + +As you might have guessed, this test checks that `PenDown()` is called at least +once. If the `painter` object didn't call this method, your test will fail with +a message like this: + +```text +path/to/my_test.cc:119: Failure +Actual function call count doesn't match this expectation: +Actually: never called; +Expected: called at least once. +Stack trace: +... +``` + +**Tip 1:** If you run the test from an Emacs buffer, you can hit on the +line number to jump right to the failed expectation. + +**Tip 2:** If your mock objects are never deleted, the final verification won't +happen. Therefore it's a good idea to turn on the heap checker in your tests +when you allocate mocks on the heap. You get that automatically if you use the +`gtest_main` library already. + +**Important note:** gMock requires expectations to be set **before** the mock +functions are called, otherwise the behavior is **undefined**. In particular, +you mustn't interleave `EXPECT_CALL()s` and calls to the mock functions. + +This means `EXPECT_CALL()` should be read as expecting that a call will occur +*in the future*, not that a call has occurred. Why does gMock work like that? +Well, specifying the expectation beforehand allows gMock to report a violation +as soon as it rises, when the context (stack trace, etc) is still available. +This makes debugging much easier. + +Admittedly, this test is contrived and doesn't do much. You can easily achieve +the same effect without using gMock. However, as we shall reveal soon, gMock +allows you to do *so much more* with the mocks. + +### Setting Expectations + +The key to using a mock object successfully is to set the *right expectations* +on it. If you set the expectations too strict, your test will fail as the result +of unrelated changes. If you set them too loose, bugs can slip through. You want +to do it just right such that your test can catch exactly the kind of bugs you +intend it to catch. gMock provides the necessary means for you to do it "just +right." + +#### General Syntax + +In gMock we use the `EXPECT_CALL()` macro to set an expectation on a mock +method. The general syntax is: + +```cpp +EXPECT_CALL(mock_object, method(matchers)) + .Times(cardinality) + .WillOnce(action) + .WillRepeatedly(action); +``` + +The macro has two arguments: first the mock object, and then the method and its +arguments. Note that the two are separated by a comma (`,`), not a period (`.`). +(Why using a comma? The answer is that it was necessary for technical reasons.) +If the method is not overloaded, the macro can also be called without matchers: + +```cpp +EXPECT_CALL(mock_object, non-overloaded-method) + .Times(cardinality) + .WillOnce(action) + .WillRepeatedly(action); +``` + +This syntax allows the test writer to specify "called with any arguments" +without explicitly specifying the number or types of arguments. To avoid +unintended ambiguity, this syntax may only be used for methods which are not +overloaded + +Either form of the macro can be followed by some optional *clauses* that provide +more information about the expectation. We'll discuss how each clause works in +the coming sections. 
+ +This syntax is designed to make an expectation read like English. For example, +you can probably guess that + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetX()) + .Times(5) + .WillOnce(Return(100)) + .WillOnce(Return(150)) + .WillRepeatedly(Return(200)); +``` + +says that the `turtle` object's `GetX()` method will be called five times, it +will return 100 the first time, 150 the second time, and then 200 every time. +Some people like to call this style of syntax a Domain-Specific Language (DSL). + +**Note:** Why do we use a macro to do this? Well it serves two purposes: first +it makes expectations easily identifiable (either by `gsearch` or by a human +reader), and second it allows gMock to include the source file location of a +failed expectation in messages, making debugging easier. + +#### Matchers: What Arguments Do We Expect? + +When a mock function takes arguments, we may specify what arguments we are +expecting, for example: + +```cpp +// Expects the turtle to move forward by 100 units. +EXPECT_CALL(turtle, Forward(100)); +``` + +Oftentimes you do not want to be too specific. Remember that talk about tests +being too rigid? Over specification leads to brittle tests and obscures the +intent of tests. Therefore we encourage you to specify only what's necessary—no +more, no less. If you aren't interested in the value of an argument, write `_` +as the argument, which means "anything goes": + +```cpp +using ::testing::_; +... +// Expects that the turtle jumps to somewhere on the x=50 line. +EXPECT_CALL(turtle, GoTo(50, _)); +``` + +`_` is an instance of what we call **matchers**. A matcher is like a predicate +and can test whether an argument is what we'd expect. You can use a matcher +inside `EXPECT_CALL()` wherever a function argument is expected. `_` is a +convenient way of saying "any value". + +In the above examples, `100` and `50` are also matchers; implicitly, they are +the same as `Eq(100)` and `Eq(50)`, which specify that the argument must be +equal (using `operator==`) to the matcher argument. There are many +[built-in matchers](cheat_sheet.md#MatcherList) for common types (as well as +[custom matchers](cook_book.md#NewMatchers)); for example: + +```cpp +using ::testing::Ge; +... +// Expects the turtle moves forward by at least 100. +EXPECT_CALL(turtle, Forward(Ge(100))); +``` + +If you don't care about *any* arguments, rather than specify `_` for each of +them you may instead omit the parameter list: + +```cpp +// Expects the turtle to move forward. +EXPECT_CALL(turtle, Forward); +// Expects the turtle to jump somewhere. +EXPECT_CALL(turtle, GoTo); +``` + +This works for all non-overloaded methods; if a method is overloaded, you need +to help gMock resolve which overload is expected by specifying the number of +arguments and possibly also the +[types of the arguments](cook_book.md#SelectOverload). + +#### Cardinalities: How Many Times Will It Be Called? + +The first clause we can specify following an `EXPECT_CALL()` is `Times()`. We +call its argument a **cardinality** as it tells *how many times* the call should +occur. It allows us to repeat an expectation many times without actually writing +it as many times. More importantly, a cardinality can be "fuzzy", just like a +matcher can be. This allows a user to express the intent of a test exactly. + +An interesting special case is when we say `Times(0)`. 
You may have guessed - it +means that the function shouldn't be called with the given arguments at all, and +gMock will report a googletest failure whenever the function is (wrongfully) +called. + +We've seen `AtLeast(n)` as an example of fuzzy cardinalities earlier. For the +list of built-in cardinalities you can use, see +[here](cheat_sheet.md#CardinalityList). + +The `Times()` clause can be omitted. **If you omit `Times()`, gMock will infer +the cardinality for you.** The rules are easy to remember: + +* If **neither** `WillOnce()` **nor** `WillRepeatedly()` is in the + `EXPECT_CALL()`, the inferred cardinality is `Times(1)`. +* If there are *n* `WillOnce()`'s but **no** `WillRepeatedly()`, where *n* >= + 1, the cardinality is `Times(n)`. +* If there are *n* `WillOnce()`'s and **one** `WillRepeatedly()`, where *n* >= + 0, the cardinality is `Times(AtLeast(n))`. + +**Quick quiz:** what do you think will happen if a function is expected to be +called twice but actually called four times? + +#### Actions: What Should It Do? + +Remember that a mock object doesn't really have a working implementation? We as +users have to tell it what to do when a method is invoked. This is easy in +gMock. + +First, if the return type of a mock function is a built-in type or a pointer, +the function has a **default action** (a `void` function will just return, a +`bool` function will return `false`, and other functions will return 0). In +addition, in C++ 11 and above, a mock function whose return type is +default-constructible (i.e. has a default constructor) has a default action of +returning a default-constructed value. If you don't say anything, this behavior +will be used. + +Second, if a mock function doesn't have a default action, or the default action +doesn't suit you, you can specify the action to be taken each time the +expectation matches using a series of `WillOnce()` clauses followed by an +optional `WillRepeatedly()`. For example, + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(100)) + .WillOnce(Return(200)) + .WillOnce(Return(300)); +``` + +says that `turtle.GetX()` will be called *exactly three times* (gMock inferred +this from how many `WillOnce()` clauses we've written, since we didn't +explicitly write `Times()`), and will return 100, 200, and 300 respectively. + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetY()) + .WillOnce(Return(100)) + .WillOnce(Return(200)) + .WillRepeatedly(Return(300)); +``` + +says that `turtle.GetY()` will be called *at least twice* (gMock knows this as +we've written two `WillOnce()` clauses and a `WillRepeatedly()` while having no +explicit `Times()`), will return 100 and 200 respectively the first two times, +and 300 from the third time on. + +Of course, if you explicitly write a `Times()`, gMock will not try to infer the +cardinality itself. What if the number you specified is larger than there are +`WillOnce()` clauses? Well, after all `WillOnce()`s are used up, gMock will do +the *default* action for the function every time (unless, of course, you have a +`WillRepeatedly()`.). + +What can we do inside `WillOnce()` besides `Return()`? You can return a +reference using `ReturnRef(*variable*)`, or invoke a pre-defined function, among +[others](cook_book.md#using-actions). + +**Important note:** The `EXPECT_CALL()` statement evaluates the action clause +only once, even though the action may be performed many times. Therefore you +must be careful about side effects. 
The following may not do what you want: + +```cpp +using ::testing::Return; +... +int n = 100; +EXPECT_CALL(turtle, GetX()) + .Times(4) + .WillRepeatedly(Return(n++)); +``` + +Instead of returning 100, 101, 102, ..., consecutively, this mock function will +always return 100 as `n++` is only evaluated once. Similarly, `Return(new Foo)` +will create a new `Foo` object when the `EXPECT_CALL()` is executed, and will +return the same pointer every time. If you want the side effect to happen every +time, you need to define a custom action, which we'll teach in the +[cook book](http://). + +Time for another quiz! What do you think the following means? + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetY()) + .Times(4) + .WillOnce(Return(100)); +``` + +Obviously `turtle.GetY()` is expected to be called four times. But if you think +it will return 100 every time, think twice! Remember that one `WillOnce()` +clause will be consumed each time the function is invoked and the default action +will be taken afterwards. So the right answer is that `turtle.GetY()` will +return 100 the first time, but **return 0 from the second time on**, as +returning 0 is the default action for `int` functions. + +#### Using Multiple Expectations {#MultiExpectations} + +So far we've only shown examples where you have a single expectation. More +realistically, you'll specify expectations on multiple mock methods which may be +from multiple mock objects. + +By default, when a mock method is invoked, gMock will search the expectations in +the **reverse order** they are defined, and stop when an active expectation that +matches the arguments is found (you can think of it as "newer rules override +older ones."). If the matching expectation cannot take any more calls, you will +get an upper-bound-violated failure. Here's an example: + +```cpp +using ::testing::_; +... +EXPECT_CALL(turtle, Forward(_)); // #1 +EXPECT_CALL(turtle, Forward(10)) // #2 + .Times(2); +``` + +If `Forward(10)` is called three times in a row, the third time it will be an +error, as the last matching expectation (#2) has been saturated. If, however, +the third `Forward(10)` call is replaced by `Forward(20)`, then it would be OK, +as now #1 will be the matching expectation. + +**Note:** Why does gMock search for a match in the *reverse* order of the +expectations? The reason is that this allows a user to set up the default +expectations in a mock object's constructor or the test fixture's set-up phase +and then customize the mock by writing more specific expectations in the test +body. So, if you have two expectations on the same method, you want to put the +one with more specific matchers **after** the other, or the more specific rule +would be shadowed by the more general one that comes after it. + +**Tip:** It is very common to start with a catch-all expectation for a method +and `Times(AnyNumber())` (omitting arguments, or with `_` for all arguments, if +overloaded). This makes any calls to the method expected. This is not necessary +for methods that are not mentioned at all (these are "uninteresting"), but is +useful for methods that have some expectations, but for which other calls are +ok. See +[Understanding Uninteresting vs Unexpected Calls](cook_book.md#uninteresting-vs-unexpected). + +#### Ordered vs Unordered Calls {#OrderedCalls} + +By default, an expectation can match a call even though an earlier expectation +hasn't been satisfied. In other words, the calls don't have to occur in the +order the expectations are specified. 
+ +Sometimes, you may want all the expected calls to occur in a strict order. To +say this in gMock is easy: + +```cpp +using ::testing::InSequence; +... +TEST(FooTest, DrawsLineSegment) { + ... + { + InSequence seq; + + EXPECT_CALL(turtle, PenDown()); + EXPECT_CALL(turtle, Forward(100)); + EXPECT_CALL(turtle, PenUp()); + } + Foo(); +} +``` + +By creating an object of type `InSequence`, all expectations in its scope are +put into a *sequence* and have to occur *sequentially*. Since we are just +relying on the constructor and destructor of this object to do the actual work, +its name is really irrelevant. + +In this example, we test that `Foo()` calls the three expected functions in the +order as written. If a call is made out-of-order, it will be an error. + +(What if you care about the relative order of some of the calls, but not all of +them? Can you specify an arbitrary partial order? The answer is ... yes! The +details can be found [here](cook_book.md#OrderedCalls).) + +#### All Expectations Are Sticky (Unless Said Otherwise) {#StickyExpectations} + +Now let's do a quick quiz to see how well you can use this mock stuff already. +How would you test that the turtle is asked to go to the origin *exactly twice* +(you want to ignore any other instructions it receives)? + +After you've come up with your answer, take a look at ours and compare notes +(solve it yourself first - don't cheat!): + +```cpp +using ::testing::_; +using ::testing::AnyNumber; +... +EXPECT_CALL(turtle, GoTo(_, _)) // #1 + .Times(AnyNumber()); +EXPECT_CALL(turtle, GoTo(0, 0)) // #2 + .Times(2); +``` + +Suppose `turtle.GoTo(0, 0)` is called three times. In the third time, gMock will +see that the arguments match expectation #2 (remember that we always pick the +last matching expectation). Now, since we said that there should be only two +such calls, gMock will report an error immediately. This is basically what we've +told you in the [Using Multiple Expectations](#MultiExpectations) section above. + +This example shows that **expectations in gMock are "sticky" by default**, in +the sense that they remain active even after we have reached their invocation +upper bounds. This is an important rule to remember, as it affects the meaning +of the spec, and is **different** to how it's done in many other mocking +frameworks (Why'd we do that? Because we think our rule makes the common cases +easier to express and understand.). + +Simple? Let's see if you've really understood it: what does the following code +say? + +```cpp +using ::testing::Return; +... +for (int i = n; i > 0; i--) { + EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(10*i)); +} +``` + +If you think it says that `turtle.GetX()` will be called `n` times and will +return 10, 20, 30, ..., consecutively, think twice! The problem is that, as we +said, expectations are sticky. So, the second time `turtle.GetX()` is called, +the last (latest) `EXPECT_CALL()` statement will match, and will immediately +lead to an "upper bound violated" error - this piece of code is not very useful! + +One correct way of saying that `turtle.GetX()` will return 10, 20, 30, ..., is +to explicitly say that the expectations are *not* sticky. In other words, they +should *retire* as soon as they are saturated: + +```cpp +using ::testing::Return; +... 
+for (int i = n; i > 0; i--) { + EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(10*i)) + .RetiresOnSaturation(); +} +``` + +And, there's a better way to do it: in this case, we expect the calls to occur +in a specific order, and we line up the actions to match the order. Since the +order is important here, we should make it explicit using a sequence: + +```cpp +using ::testing::InSequence; +using ::testing::Return; +... +{ + InSequence s; + + for (int i = 1; i <= n; i++) { + EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(10*i)) + .RetiresOnSaturation(); + } +} +``` + +By the way, the other situation where an expectation may *not* be sticky is when +it's in a sequence - as soon as another expectation that comes after it in the +sequence has been used, it automatically retires (and will never be used to +match any call). + +#### Uninteresting Calls + +A mock object may have many methods, and not all of them are that interesting. +For example, in some tests we may not care about how many times `GetX()` and +`GetY()` get called. + +In gMock, if you are not interested in a method, just don't say anything about +it. If a call to this method occurs, you'll see a warning in the test output, +but it won't be a failure. This is called "naggy" behavior; to change, see +[The Nice, the Strict, and the Naggy](cook_book.md#NiceStrictNaggy). diff --git a/GraphBLAS/CUDA/test/googlemock/docs/gmock_faq.md b/GraphBLAS/CUDA/test/googlemock/docs/gmock_faq.md new file mode 100644 index 0000000000..214aabf121 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/gmock_faq.md @@ -0,0 +1,396 @@ +## Legacy gMock FAQ {#GMockFaq} + + + +### When I call a method on my mock object, the method for the real object is invoked instead. What's the problem? + +In order for a method to be mocked, it must be *virtual*, unless you use the +[high-perf dependency injection technique](#MockingNonVirtualMethods). + +### Can I mock a variadic function? + +You cannot mock a variadic function (i.e. a function taking ellipsis (`...`) +arguments) directly in gMock. + +The problem is that in general, there is *no way* for a mock object to know how +many arguments are passed to the variadic method, and what the arguments' types +are. Only the *author of the base class* knows the protocol, and we cannot look +into his or her head. + +Therefore, to mock such a function, the *user* must teach the mock object how to +figure out the number of arguments and their types. One way to do it is to +provide overloaded versions of the function. + +Ellipsis arguments are inherited from C and not really a C++ feature. They are +unsafe to use and don't work with arguments that have constructors or +destructors. Therefore we recommend to avoid them in C++ as much as possible. + +### MSVC gives me warning C4301 or C4373 when I define a mock method with a const parameter. Why? + +If you compile this using Microsoft Visual C++ 2005 SP1: + +```cpp +class Foo { + ... + virtual void Bar(const int i) = 0; +}; + +class MockFoo : public Foo { + ... + MOCK_METHOD(void, Bar, (const int i), (override)); +}; +``` + +You may get the following warning: + +```shell +warning C4301: 'MockFoo::Bar': overriding virtual function only differs from 'Foo::Bar' by const/volatile qualifier +``` + +This is a MSVC bug. The same code compiles fine with gcc, for example. 
If you +use Visual C++ 2008 SP1, you would get the warning: + +```shell +warning C4373: 'MockFoo::Bar': virtual function overrides 'Foo::Bar', previous versions of the compiler did not override when parameters only differed by const/volatile qualifiers +``` + +In C++, if you *declare* a function with a `const` parameter, the `const` +modifier is ignored. Therefore, the `Foo` base class above is equivalent to: + +```cpp +class Foo { + ... + virtual void Bar(int i) = 0; // int or const int? Makes no difference. +}; +``` + +In fact, you can *declare* `Bar()` with an `int` parameter, and define it with a +`const int` parameter. The compiler will still match them up. + +Since making a parameter `const` is meaningless in the method declaration, we +recommend to remove it in both `Foo` and `MockFoo`. That should workaround the +VC bug. + +Note that we are talking about the *top-level* `const` modifier here. If the +function parameter is passed by pointer or reference, declaring the pointee or +referee as `const` is still meaningful. For example, the following two +declarations are *not* equivalent: + +```cpp +void Bar(int* p); // Neither p nor *p is const. +void Bar(const int* p); // p is not const, but *p is. +``` + + + +### I can't figure out why gMock thinks my expectations are not satisfied. What should I do? + +You might want to run your test with `--gmock_verbose=info`. This flag lets +gMock print a trace of every mock function call it receives. By studying the +trace, you'll gain insights on why the expectations you set are not met. + +If you see the message "The mock function has no default action set, and its +return type has no default value set.", then try +[adding a default action](for_dummies.md#DefaultValue). Due to a known issue, +unexpected calls on mocks without default actions don't print out a detailed +comparison between the actual arguments and the expected arguments. + +### My program crashed and `ScopedMockLog` spit out tons of messages. Is it a gMock bug? + +gMock and `ScopedMockLog` are likely doing the right thing here. + +When a test crashes, the failure signal handler will try to log a lot of +information (the stack trace, and the address map, for example). The messages +are compounded if you have many threads with depth stacks. When `ScopedMockLog` +intercepts these messages and finds that they don't match any expectations, it +prints an error for each of them. + +You can learn to ignore the errors, or you can rewrite your expectations to make +your test more robust, for example, by adding something like: + +```cpp +using ::testing::AnyNumber; +using ::testing::Not; +... + // Ignores any log not done by us. + EXPECT_CALL(log, Log(_, Not(EndsWith("/my_file.cc")), _)) + .Times(AnyNumber()); +``` + +### How can I assert that a function is NEVER called? + +```cpp +using ::testing::_; +... + EXPECT_CALL(foo, Bar(_)) + .Times(0); +``` + + + +### I have a failed test where gMock tells me TWICE that a particular expectation is not satisfied. Isn't this redundant? + +When gMock detects a failure, it prints relevant information (the mock function +arguments, the state of relevant expectations, and etc) to help the user debug. +If another failure is detected, gMock will do the same, including printing the +state of relevant expectations. + +Sometimes an expectation's state didn't change between two failures, and you'll +see the same description of the state twice. They are however *not* redundant, +as they refer to *different points in time*. 
The fact they are the same *is* +interesting information. + +### I get a heapcheck failure when using a mock object, but using a real object is fine. What can be wrong? + +Does the class (hopefully a pure interface) you are mocking have a virtual +destructor? + +Whenever you derive from a base class, make sure its destructor is virtual. +Otherwise Bad Things will happen. Consider the following code: + +```cpp +class Base { + public: + // Not virtual, but should be. + ~Base() { ... } + ... +}; + +class Derived : public Base { + public: + ... + private: + std::string value_; +}; + +... + Base* p = new Derived; + ... + delete p; // Surprise! ~Base() will be called, but ~Derived() will not + // - value_ is leaked. +``` + +By changing `~Base()` to virtual, `~Derived()` will be correctly called when +`delete p` is executed, and the heap checker will be happy. + +### The "newer expectations override older ones" rule makes writing expectations awkward. Why does gMock do that? + +When people complain about this, often they are referring to code like: + +```cpp +using ::testing::Return; +... + // foo.Bar() should be called twice, return 1 the first time, and return + // 2 the second time. However, I have to write the expectations in the + // reverse order. This sucks big time!!! + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(2)) + .RetiresOnSaturation(); + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(1)) + .RetiresOnSaturation(); +``` + +The problem, is that they didn't pick the **best** way to express the test's +intent. + +By default, expectations don't have to be matched in *any* particular order. If +you want them to match in a certain order, you need to be explicit. This is +gMock's (and jMock's) fundamental philosophy: it's easy to accidentally +over-specify your tests, and we want to make it harder to do so. + +There are two better ways to write the test spec. You could either put the +expectations in sequence: + +```cpp +using ::testing::Return; +... + // foo.Bar() should be called twice, return 1 the first time, and return + // 2 the second time. Using a sequence, we can write the expectations + // in their natural order. + { + InSequence s; + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(1)) + .RetiresOnSaturation(); + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(2)) + .RetiresOnSaturation(); + } +``` + +or you can put the sequence of actions in the same expectation: + +```cpp +using ::testing::Return; +... + // foo.Bar() should be called twice, return 1 the first time, and return + // 2 the second time. + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(1)) + .WillOnce(Return(2)) + .RetiresOnSaturation(); +``` + +Back to the original questions: why does gMock search the expectations (and +`ON_CALL`s) from back to front? Because this allows a user to set up a mock's +behavior for the common case early (e.g. in the mock's constructor or the test +fixture's set-up phase) and customize it with more specific rules later. If +gMock searches from front to back, this very useful pattern won't be possible. + +### gMock prints a warning when a function without EXPECT_CALL is called, even if I have set its behavior using ON_CALL. Would it be reasonable not to show the warning in this case? + +When choosing between being neat and being safe, we lean toward the latter. So +the answer is that we think it's better to show the warning. + +Often people write `ON_CALL`s in the mock object's constructor or `SetUp()`, as +the default behavior rarely changes from test to test. 
Then in the test body +they set the expectations, which are often different for each test. Having an +`ON_CALL` in the set-up part of a test doesn't mean that the calls are expected. +If there's no `EXPECT_CALL` and the method is called, it's possibly an error. If +we quietly let the call go through without notifying the user, bugs may creep in +unnoticed. + +If, however, you are sure that the calls are OK, you can write + +```cpp +using ::testing::_; +... + EXPECT_CALL(foo, Bar(_)) + .WillRepeatedly(...); +``` + +instead of + +```cpp +using ::testing::_; +... + ON_CALL(foo, Bar(_)) + .WillByDefault(...); +``` + +This tells gMock that you do expect the calls and no warning should be printed. + +Also, you can control the verbosity by specifying `--gmock_verbose=error`. Other +values are `info` and `warning`. If you find the output too noisy when +debugging, just choose a less verbose level. + +### How can I delete the mock function's argument in an action? + +If your mock function takes a pointer argument and you want to delete that +argument, you can use testing::DeleteArg() to delete the N'th (zero-indexed) +argument: + +```cpp +using ::testing::_; + ... + MOCK_METHOD(void, Bar, (X* x, const Y& y)); + ... + EXPECT_CALL(mock_foo_, Bar(_, _)) + .WillOnce(testing::DeleteArg<0>())); +``` + +### How can I perform an arbitrary action on a mock function's argument? + +If you find yourself needing to perform some action that's not supported by +gMock directly, remember that you can define your own actions using +[`MakeAction()`](#NewMonoActions) or +[`MakePolymorphicAction()`](#NewPolyActions), or you can write a stub function +and invoke it using [`Invoke()`](#FunctionsAsActions). + +```cpp +using ::testing::_; +using ::testing::Invoke; + ... + MOCK_METHOD(void, Bar, (X* p)); + ... + EXPECT_CALL(mock_foo_, Bar(_)) + .WillOnce(Invoke(MyAction(...))); +``` + +### My code calls a static/global function. Can I mock it? + +You can, but you need to make some changes. + +In general, if you find yourself needing to mock a static function, it's a sign +that your modules are too tightly coupled (and less flexible, less reusable, +less testable, etc). You are probably better off defining a small interface and +call the function through that interface, which then can be easily mocked. It's +a bit of work initially, but usually pays for itself quickly. + +This Google Testing Blog +[post](https://testing.googleblog.com/2008/06/defeat-static-cling.html) says it +excellently. Check it out. + +### My mock object needs to do complex stuff. It's a lot of pain to specify the actions. gMock sucks! + +I know it's not a question, but you get an answer for free any way. :-) + +With gMock, you can create mocks in C++ easily. And people might be tempted to +use them everywhere. Sometimes they work great, and sometimes you may find them, +well, a pain to use. So, what's wrong in the latter case? + +When you write a test without using mocks, you exercise the code and assert that +it returns the correct value or that the system is in an expected state. This is +sometimes called "state-based testing". + +Mocks are great for what some call "interaction-based" testing: instead of +checking the system state at the very end, mock objects verify that they are +invoked the right way and report an error as soon as it arises, giving you a +handle on the precise context in which the error was triggered. This is often +more effective and economical to do than state-based testing. 
+ +If you are doing state-based testing and using a test double just to simulate +the real object, you are probably better off using a fake. Using a mock in this +case causes pain, as it's not a strong point for mocks to perform complex +actions. If you experience this and think that mocks suck, you are just not +using the right tool for your problem. Or, you might be trying to solve the +wrong problem. :-) + +### I got a warning "Uninteresting function call encountered - default action taken.." Should I panic? + +By all means, NO! It's just an FYI. :-) + +What it means is that you have a mock function, you haven't set any expectations +on it (by gMock's rule this means that you are not interested in calls to this +function and therefore it can be called any number of times), and it is called. +That's OK - you didn't say it's not OK to call the function! + +What if you actually meant to disallow this function to be called, but forgot to +write `EXPECT_CALL(foo, Bar()).Times(0)`? While one can argue that it's the +user's fault, gMock tries to be nice and prints you a note. + +So, when you see the message and believe that there shouldn't be any +uninteresting calls, you should investigate what's going on. To make your life +easier, gMock dumps the stack trace when an uninteresting call is encountered. +From that you can figure out which mock function it is, and how it is called. + +### I want to define a custom action. Should I use Invoke() or implement the ActionInterface interface? + +Either way is fine - you want to choose the one that's more convenient for your +circumstance. + +Usually, if your action is for a particular function type, defining it using +`Invoke()` should be easier; if your action can be used in functions of +different types (e.g. if you are defining `Return(*value*)`), +`MakePolymorphicAction()` is easiest. Sometimes you want precise control on what +types of functions the action can be used in, and implementing `ActionInterface` +is the way to go here. See the implementation of `Return()` in +`testing/base/public/gmock-actions.h` for an example. + +### I use SetArgPointee() in WillOnce(), but gcc complains about "conflicting return type specified". What does it mean? + +You got this error as gMock has no idea what value it should return when the +mock method is called. `SetArgPointee()` says what the side effect is, but +doesn't say what the return value should be. You need `DoAll()` to chain a +`SetArgPointee()` with a `Return()` that provides a value appropriate to the API +being mocked. + +See this [recipe](cook_book.md#mocking-side-effects) for more details and an +example. + +### I have a huge mock class, and Microsoft Visual C++ runs out of memory when compiling it. What can I do? + +We've noticed that when the `/clr` compiler flag is used, Visual C++ uses 5~6 +times as much memory when compiling a mock class. We suggest to avoid `/clr` +when compiling native C++ mocks. diff --git a/GraphBLAS/CUDA/test/googlemock/docs/pump_manual.md b/GraphBLAS/CUDA/test/googlemock/docs/pump_manual.md new file mode 100644 index 0000000000..cdf7c57da2 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/pump_manual.md @@ -0,0 +1,187 @@ +Pump is Useful for Meta Programming. + +# The Problem + +Template and macro libraries often need to define many classes, functions, or +macros that vary only (or almost only) in the number of arguments they take. +It's a lot of repetitive, mechanical, and error-prone work. 
+ +Our experience is that it's tedious to write custom scripts, which tend to +reflect the structure of the generated code poorly and are often hard to read +and edit. For example, a small change needed in the generated code may require +some non-intuitive, non-trivial changes in the script. This is especially +painful when experimenting with the code. + +This script may be useful for generating meta code, for example a series of +macros of FOO1, FOO2, etc. Nevertheless, please make it your last resort +technique by favouring C++ template metaprogramming or variadic macros. + +# Our Solution + +Pump (for Pump is Useful for Meta Programming, Pretty Useful for Meta +Programming, or Practical Utility for Meta Programming, whichever you prefer) is +a simple meta-programming tool for C++. The idea is that a programmer writes a +`foo.pump` file which contains C++ code plus meta code that manipulates the C++ +code. The meta code can handle iterations over a range, nested iterations, local +meta variable definitions, simple arithmetic, and conditional expressions. You +can view it as a small Domain-Specific Language. The meta language is designed +to be non-intrusive (s.t. it won't confuse Emacs' C++ mode, for example) and +concise, making Pump code intuitive and easy to maintain. + +## Highlights + +* The implementation is in a single Python script and thus ultra portable: no + build or installation is needed and it works cross platforms. +* Pump tries to be smart with respect to + [Google's style guide](https://github.com/google/styleguide): it breaks long + lines (easy to have when they are generated) at acceptable places to fit + within 80 columns and indent the continuation lines correctly. +* The format is human-readable and more concise than XML. +* The format works relatively well with Emacs' C++ mode. + +## Examples + +The following Pump code (where meta keywords start with `$`, `[[` and `]]` are +meta brackets, and `$$` starts a meta comment that ends with the line): + +``` +$var n = 3 $$ Defines a meta variable n. +$range i 0..n $$ Declares the range of meta iterator i (inclusive). +$for i [[ + $$ Meta loop. +// Foo$i does blah for $i-ary predicates. +$range j 1..i +template +class Foo$i { +$if i == 0 [[ + blah a; +]] $elif i <= 2 [[ + blah b; +]] $else [[ + blah c; +]] +}; + +]] +``` + +will be translated by the Pump compiler to: + +```cpp +// Foo0 does blah for 0-ary predicates. +template +class Foo0 { + blah a; +}; + +// Foo1 does blah for 1-ary predicates. +template +class Foo1 { + blah b; +}; + +// Foo2 does blah for 2-ary predicates. +template +class Foo2 { + blah b; +}; + +// Foo3 does blah for 3-ary predicates. +template +class Foo3 { + blah c; +}; +``` + +In another example, + +``` +$range i 1..n +Func($for i + [[a$i]]); +$$ The text between i and [[ is the separator between iterations. +``` + +will generate one of the following lines (without the comments), depending on +the value of `n`: + +```cpp +Func(); // If n is 0. +Func(a1); // If n is 1. +Func(a1 + a2); // If n is 2. +Func(a1 + a2 + a3); // If n is 3. +// And so on... +``` + +## Constructs + +We support the following meta programming constructs: + +| `$var id = exp` | Defines a named constant value. `$id` is | +: : valid util the end of the current meta : +: : lexical block. : +| :------------------------------- | :--------------------------------------- | +| `$range id exp..exp` | Sets the range of an iteration variable, | +: : which can be reused in multiple loops : +: : later. 
: +| `$for id sep [[ code ]]` | Iteration. The range of `id` must have | +: : been defined earlier. `$id` is valid in : +: : `code`. : +| `$($)` | Generates a single `$` character. | +| `$id` | Value of the named constant or iteration | +: : variable. : +| `$(exp)` | Value of the expression. | +| `$if exp [[ code ]] else_branch` | Conditional. | +| `[[ code ]]` | Meta lexical block. | +| `cpp_code` | Raw C++ code. | +| `$$ comment` | Meta comment. | + +**Note:** To give the user some freedom in formatting the Pump source code, Pump +ignores a new-line character if it's right after `$for foo` or next to `[[` or +`]]`. Without this rule you'll often be forced to write very long lines to get +the desired output. Therefore sometimes you may need to insert an extra new-line +in such places for a new-line to show up in your output. + +## Grammar + +```ebnf +code ::= atomic_code* +atomic_code ::= $var id = exp + | $var id = [[ code ]] + | $range id exp..exp + | $for id sep [[ code ]] + | $($) + | $id + | $(exp) + | $if exp [[ code ]] else_branch + | [[ code ]] + | cpp_code +sep ::= cpp_code | empty_string +else_branch ::= $else [[ code ]] + | $elif exp [[ code ]] else_branch + | empty_string +exp ::= simple_expression_in_Python_syntax +``` + +## Code + +You can find the source code of Pump in [scripts/pump.py](../scripts/pump.py). +It is still very unpolished and lacks automated tests, although it has been +successfully used many times. If you find a chance to use it in your project, +please let us know what you think! We also welcome help on improving Pump. + +## Real Examples + +You can find real-world applications of Pump in +[Google Test](https://github.com/google/googletest/tree/master/googletest) and +[Google Mock](https://github.com/google/googletest/tree/master/googlemock). The +source file `foo.h.pump` generates `foo.h`. + +## Tips + +* If a meta variable is followed by a letter or digit, you can separate them + using `[[]]`, which inserts an empty string. For example `Foo$j[[]]Helper` + generate `Foo1Helper` when `j` is 1. +* To avoid extra-long Pump source lines, you can break a line anywhere you + want by inserting `[[]]` followed by a new line. Since any new-line + character next to `[[` or `]]` is ignored, the generated code won't contain + this new line. diff --git a/GraphBLAS/CUDA/test/googlemock/include/gmock/gmock-actions.h b/GraphBLAS/CUDA/test/googlemock/include/gmock/gmock-actions.h new file mode 100644 index 0000000000..615651b342 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/include/gmock/gmock-actions.h @@ -0,0 +1,1567 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Google Mock - a framework for writing C++ mock classes. +// +// The ACTION* family of macros can be used in a namespace scope to +// define custom actions easily. The syntax: +// +// ACTION(name) { statements; } +// +// will define an action with the given name that executes the +// statements. The value returned by the statements will be used as +// the return value of the action. Inside the statements, you can +// refer to the K-th (0-based) argument of the mock function by +// 'argK', and refer to its type by 'argK_type'. For example: +// +// ACTION(IncrementArg1) { +// arg1_type temp = arg1; +// return ++(*temp); +// } +// +// allows you to write +// +// ...WillOnce(IncrementArg1()); +// +// You can also refer to the entire argument tuple and its type by +// 'args' and 'args_type', and refer to the mock function type and its +// return type by 'function_type' and 'return_type'. +// +// Note that you don't need to specify the types of the mock function +// arguments. However rest assured that your code is still type-safe: +// you'll get a compiler error if *arg1 doesn't support the ++ +// operator, or if the type of ++(*arg1) isn't compatible with the +// mock function's return type, for example. +// +// Sometimes you'll want to parameterize the action. For that you can use +// another macro: +// +// ACTION_P(name, param_name) { statements; } +// +// For example: +// +// ACTION_P(Add, n) { return arg0 + n; } +// +// will allow you to write: +// +// ...WillOnce(Add(5)); +// +// Note that you don't need to provide the type of the parameter +// either. If you need to reference the type of a parameter named +// 'foo', you can write 'foo_type'. For example, in the body of +// ACTION_P(Add, n) above, you can write 'n_type' to refer to the type +// of 'n'. +// +// We also provide ACTION_P2, ACTION_P3, ..., up to ACTION_P10 to support +// multi-parameter actions. +// +// For the purpose of typing, you can view +// +// ACTION_Pk(Foo, p1, ..., pk) { ... } +// +// as shorthand for +// +// template +// FooActionPk Foo(p1_type p1, ..., pk_type pk) { ... } +// +// In particular, you can provide the template type arguments +// explicitly when invoking Foo(), as in Foo(5, false); +// although usually you can rely on the compiler to infer the types +// for you automatically. You can assign the result of expression +// Foo(p1, ..., pk) to a variable of type FooActionPk. This can be useful when composing actions. +// +// You can also overload actions with different numbers of parameters: +// +// ACTION_P(Plus, a) { ... } +// ACTION_P2(Plus, a, b) { ... 
} +// +// While it's tempting to always use the ACTION* macros when defining +// a new action, you should also consider implementing ActionInterface +// or using MakePolymorphicAction() instead, especially if you need to +// use the action a lot. While these approaches require more work, +// they give you more control on the types of the mock function +// arguments and the action parameters, which in general leads to +// better compiler error messages that pay off in the long run. They +// also allow overloading actions based on parameter types (as opposed +// to just based on the number of parameters). +// +// CAVEAT: +// +// ACTION*() can only be used in a namespace scope as templates cannot be +// declared inside of a local class. +// Users can, however, define any local functors (e.g. a lambda) that +// can be used as actions. +// +// MORE INFORMATION: +// +// To learn more about using these macros, please search for 'ACTION' on +// https://github.com/google/googletest/blob/master/googlemock/docs/cook_book.md + +// GOOGLETEST_CM0002 DO NOT DELETE + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ + +#ifndef _WIN32_WCE +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include "gmock/internal/gmock-internal-utils.h" +#include "gmock/internal/gmock-port.h" +#include "gmock/internal/gmock-pp.h" + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4100) +#endif + +namespace testing { + +// To implement an action Foo, define: +// 1. a class FooAction that implements the ActionInterface interface, and +// 2. a factory function that creates an Action object from a +// const FooAction*. +// +// The two-level delegation design follows that of Matcher, providing +// consistency for extension developers. It also eases ownership +// management as Action objects can now be copied like plain values. + +namespace internal { + +// BuiltInDefaultValueGetter::Get() returns a +// default-constructed T value. BuiltInDefaultValueGetter::Get() crashes with an error. +// +// This primary template is used when kDefaultConstructible is true. +template +struct BuiltInDefaultValueGetter { + static T Get() { return T(); } +}; +template +struct BuiltInDefaultValueGetter { + static T Get() { + Assert(false, __FILE__, __LINE__, + "Default action undefined for the function return type."); + return internal::Invalid(); + // The above statement will never be reached, but is required in + // order for this function to compile. + } +}; + +// BuiltInDefaultValue::Get() returns the "built-in" default value +// for type T, which is NULL when T is a raw pointer type, 0 when T is +// a numeric type, false when T is bool, or "" when T is string or +// std::string. In addition, in C++11 and above, it turns a +// default-constructed T value if T is default constructible. For any +// other type T, the built-in default T value is undefined, and the +// function will abort the process. +template +class BuiltInDefaultValue { + public: + // This function returns true if and only if type T has a built-in default + // value. + static bool Exists() { + return ::std::is_default_constructible::value; + } + + static T Get() { + return BuiltInDefaultValueGetter< + T, ::std::is_default_constructible::value>::Get(); + } +}; + +// This partial specialization says that we use the same built-in +// default value for T and const T. 
+template +class BuiltInDefaultValue { + public: + static bool Exists() { return BuiltInDefaultValue::Exists(); } + static T Get() { return BuiltInDefaultValue::Get(); } +}; + +// This partial specialization defines the default values for pointer +// types. +template +class BuiltInDefaultValue { + public: + static bool Exists() { return true; } + static T* Get() { return nullptr; } +}; + +// The following specializations define the default values for +// specific types we care about. +#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \ + template <> \ + class BuiltInDefaultValue { \ + public: \ + static bool Exists() { return true; } \ + static type Get() { return value; } \ + } + +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, ); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::std::string, ""); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(bool, false); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned char, '\0'); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed char, '\0'); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(char, '\0'); + +// There's no need for a default action for signed wchar_t, as that +// type is the same as wchar_t for gcc, and invalid for MSVC. +// +// There's also no need for a default action for unsigned wchar_t, as +// that type is the same as unsigned int for gcc, and invalid for +// MSVC. +#if GMOCK_WCHAR_T_IS_NATIVE_ +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(wchar_t, 0U); // NOLINT +#endif + +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long long, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0); + +#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_ + +// Simple two-arg form of std::disjunction. +template +using disjunction = typename ::std::conditional::type; + +} // namespace internal + +// When an unexpected function call is encountered, Google Mock will +// let it return a default value if the user has specified one for its +// return type, or if the return type has a built-in default value; +// otherwise Google Mock won't know what value to return and will have +// to abort the process. +// +// The DefaultValue class allows a user to specify the +// default value for a type T that is both copyable and publicly +// destructible (i.e. anything that can be used as a function return +// type). The usage is: +// +// // Sets the default value for type T to be foo. +// DefaultValue::Set(foo); +template +class DefaultValue { + public: + // Sets the default value for type T; requires T to be + // copy-constructable and have a public destructor. + static void Set(T x) { + delete producer_; + producer_ = new FixedValueProducer(x); + } + + // Provides a factory function to be called to generate the default value. + // This method can be used even if T is only move-constructible, but it is not + // limited to that case. 
+ typedef T (*FactoryFunction)(); + static void SetFactory(FactoryFunction factory) { + delete producer_; + producer_ = new FactoryValueProducer(factory); + } + + // Unsets the default value for type T. + static void Clear() { + delete producer_; + producer_ = nullptr; + } + + // Returns true if and only if the user has set the default value for type T. + static bool IsSet() { return producer_ != nullptr; } + + // Returns true if T has a default return value set by the user or there + // exists a built-in default value. + static bool Exists() { + return IsSet() || internal::BuiltInDefaultValue::Exists(); + } + + // Returns the default value for type T if the user has set one; + // otherwise returns the built-in default value. Requires that Exists() + // is true, which ensures that the return value is well-defined. + static T Get() { + return producer_ == nullptr ? internal::BuiltInDefaultValue::Get() + : producer_->Produce(); + } + + private: + class ValueProducer { + public: + virtual ~ValueProducer() {} + virtual T Produce() = 0; + }; + + class FixedValueProducer : public ValueProducer { + public: + explicit FixedValueProducer(T value) : value_(value) {} + T Produce() override { return value_; } + + private: + const T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(FixedValueProducer); + }; + + class FactoryValueProducer : public ValueProducer { + public: + explicit FactoryValueProducer(FactoryFunction factory) + : factory_(factory) {} + T Produce() override { return factory_(); } + + private: + const FactoryFunction factory_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(FactoryValueProducer); + }; + + static ValueProducer* producer_; +}; + +// This partial specialization allows a user to set default values for +// reference types. +template +class DefaultValue { + public: + // Sets the default value for type T&. + static void Set(T& x) { // NOLINT + address_ = &x; + } + + // Unsets the default value for type T&. + static void Clear() { address_ = nullptr; } + + // Returns true if and only if the user has set the default value for type T&. + static bool IsSet() { return address_ != nullptr; } + + // Returns true if T has a default return value set by the user or there + // exists a built-in default value. + static bool Exists() { + return IsSet() || internal::BuiltInDefaultValue::Exists(); + } + + // Returns the default value for type T& if the user has set one; + // otherwise returns the built-in default value if there is one; + // otherwise aborts the process. + static T& Get() { + return address_ == nullptr ? internal::BuiltInDefaultValue::Get() + : *address_; + } + + private: + static T* address_; +}; + +// This specialization allows DefaultValue::Get() to +// compile. +template <> +class DefaultValue { + public: + static bool Exists() { return true; } + static void Get() {} +}; + +// Points to the user-set default value for type T. +template +typename DefaultValue::ValueProducer* DefaultValue::producer_ = nullptr; + +// Points to the user-set default value for type T&. +template +T* DefaultValue::address_ = nullptr; + +// Implement this interface to define an action for function type F. +template +class ActionInterface { + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + ActionInterface() {} + virtual ~ActionInterface() {} + + // Performs the action. This method is not const, as in general an + // action can have side effects and be stateful. 
For example, a + // get-the-next-element-from-the-collection action will need to + // remember the current element. + virtual Result Perform(const ArgumentTuple& args) = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionInterface); +}; + +// An Action is a copyable and IMMUTABLE (except by assignment) +// object that represents an action to be taken when a mock function +// of type F is called. The implementation of Action is just a +// std::shared_ptr to const ActionInterface. Don't inherit from Action! +// You can view an object implementing ActionInterface as a +// concrete action (including its current state), and an Action +// object as a handle to it. +template +class Action { + // Adapter class to allow constructing Action from a legacy ActionInterface. + // New code should create Actions from functors instead. + struct ActionAdapter { + // Adapter must be copyable to satisfy std::function requirements. + ::std::shared_ptr> impl_; + + template + typename internal::Function::Result operator()(Args&&... args) { + return impl_->Perform( + ::std::forward_as_tuple(::std::forward(args)...)); + } + }; + + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + // Constructs a null Action. Needed for storing Action objects in + // STL containers. + Action() {} + + // Construct an Action from a specified callable. + // This cannot take std::function directly, because then Action would not be + // directly constructible from lambda (it would require two conversions). + template , G>, + typename IsNoArgsFunctor = + ::std::is_constructible<::std::function, G>, + typename = typename ::std::enable_if::value>::type> + Action(G&& fun) { // NOLINT + Init(::std::forward(fun), IsCompatibleFunctor()); + } + + // Constructs an Action from its implementation. + explicit Action(ActionInterface* impl) + : fun_(ActionAdapter{::std::shared_ptr>(impl)}) {} + + // This constructor allows us to turn an Action object into an + // Action, as long as F's arguments can be implicitly converted + // to Func's and Func's return type can be implicitly converted to F's. + template + explicit Action(const Action& action) : fun_(action.fun_) {} + + // Returns true if and only if this is the DoDefault() action. + bool IsDoDefault() const { return fun_ == nullptr; } + + // Performs the action. Note that this method is const even though + // the corresponding method in ActionInterface is not. The reason + // is that a const Action means that it cannot be re-bound to + // another concrete action, not that the concrete action it binds to + // cannot change state. (Think of the difference between a const + // pointer and a pointer to const.) + Result Perform(ArgumentTuple args) const { + if (IsDoDefault()) { + internal::IllegalDoDefault(__FILE__, __LINE__); + } + return internal::Apply(fun_, ::std::move(args)); + } + + private: + template + friend class Action; + + template + void Init(G&& g, ::std::true_type) { + fun_ = ::std::forward(g); + } + + template + void Init(G&& g, ::std::false_type) { + fun_ = IgnoreArgs::type>{::std::forward(g)}; + } + + template + struct IgnoreArgs { + template + Result operator()(const Args&...) const { + return function_impl(); + } + + FunctionImpl function_impl; + }; + + // fun_ is an empty function if and only if this is the DoDefault() action. + ::std::function fun_; +}; + +// The PolymorphicAction class template makes it easy to implement a +// polymorphic action (i.e. 
an action that can be used in mock +// functions of than one type, e.g. Return()). +// +// To define a polymorphic action, a user first provides a COPYABLE +// implementation class that has a Perform() method template: +// +// class FooAction { +// public: +// template +// Result Perform(const ArgumentTuple& args) const { +// // Processes the arguments and returns a result, using +// // std::get(args) to get the N-th (0-based) argument in the tuple. +// } +// ... +// }; +// +// Then the user creates the polymorphic action using +// MakePolymorphicAction(object) where object has type FooAction. See +// the definition of Return(void) and SetArgumentPointee(value) for +// complete examples. +template +class PolymorphicAction { + public: + explicit PolymorphicAction(const Impl& impl) : impl_(impl) {} + + template + operator Action() const { + return Action(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public ActionInterface { + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + Result Perform(const ArgumentTuple& args) override { + return impl_.template Perform(args); + } + + private: + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(MonomorphicImpl); + }; + + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(PolymorphicAction); +}; + +// Creates an Action from its implementation and returns it. The +// created Action object owns the implementation. +template +Action MakeAction(ActionInterface* impl) { + return Action(impl); +} + +// Creates a polymorphic action from its implementation. This is +// easier to use than the PolymorphicAction constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicAction(foo); +// vs +// PolymorphicAction(foo); +template +inline PolymorphicAction MakePolymorphicAction(const Impl& impl) { + return PolymorphicAction(impl); +} + +namespace internal { + +// Helper struct to specialize ReturnAction to execute a move instead of a copy +// on return. Useful for move-only types, but could be used on any type. +template +struct ByMoveWrapper { + explicit ByMoveWrapper(T value) : payload(std::move(value)) {} + T payload; +}; + +// Implements the polymorphic Return(x) action, which can be used in +// any function that returns the type of x, regardless of the argument +// types. +// +// Note: The value passed into Return must be converted into +// Function::Result when this action is cast to Action rather than +// when that action is performed. This is important in scenarios like +// +// MOCK_METHOD1(Method, T(U)); +// ... +// { +// Foo foo; +// X x(&foo); +// EXPECT_CALL(mock, Method(_)).WillOnce(Return(x)); +// } +// +// In the example above the variable x holds reference to foo which leaves +// scope and gets destroyed. If copying X just copies a reference to foo, +// that copy will be left with a hanging reference. If conversion to T +// makes a copy of foo, the above code is safe. To support that scenario, we +// need to make sure that the type conversion happens inside the EXPECT_CALL +// statement, and conversion of the result of Return to Action is a +// good place for that. +// +// The real life example of the above scenario happens when an invocation +// of gtl::Container() is passed into Return. +// +template +class ReturnAction { + public: + // Constructs a ReturnAction object from the value to be returned. 
+ // 'value' is passed by value instead of by const reference in order + // to allow Return("string literal") to compile. + explicit ReturnAction(R value) : value_(new R(std::move(value))) {} + + // This template type conversion operator allows Return(x) to be + // used in ANY function that returns x's type. + template + operator Action() const { // NOLINT + // Assert statement belongs here because this is the best place to verify + // conditions on F. It produces the clearest error messages + // in most compilers. + // Impl really belongs in this scope as a local class but can't + // because MSVC produces duplicate symbols in different translation units + // in this case. Until MS fixes that bug we put Impl into the class scope + // and put the typedef both here (for use in assert statement) and + // in the Impl class. But both definitions must be the same. + typedef typename Function::Result Result; + GTEST_COMPILE_ASSERT_( + !std::is_reference::value, + use_ReturnRef_instead_of_Return_to_return_a_reference); + static_assert(!std::is_void::value, + "Can't use Return() on an action expected to return `void`."); + return Action(new Impl(value_)); + } + + private: + // Implements the Return(x) action for a particular function type F. + template + class Impl : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + // The implicit cast is necessary when Result has more than one + // single-argument constructor (e.g. Result is std::vector) and R + // has a type conversion operator template. In that case, value_(value) + // won't compile as the compiler doesn't known which constructor of + // Result to call. ImplicitCast_ forces the compiler to convert R to + // Result without considering explicit constructors, thus resolving the + // ambiguity. value_ is then initialized using its copy constructor. + explicit Impl(const std::shared_ptr& value) + : value_before_cast_(*value), + value_(ImplicitCast_(value_before_cast_)) {} + + Result Perform(const ArgumentTuple&) override { return value_; } + + private: + GTEST_COMPILE_ASSERT_(!std::is_reference::value, + Result_cannot_be_a_reference_type); + // We save the value before casting just in case it is being cast to a + // wrapper type. + R value_before_cast_; + Result value_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl); + }; + + // Partially specialize for ByMoveWrapper. This version of ReturnAction will + // move its contents instead. + template + class Impl, F> : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + explicit Impl(const std::shared_ptr& wrapper) + : performed_(false), wrapper_(wrapper) {} + + Result Perform(const ArgumentTuple&) override { + GTEST_CHECK_(!performed_) + << "A ByMove() action should only be performed once."; + performed_ = true; + return std::move(wrapper_->payload); + } + + private: + bool performed_; + const std::shared_ptr wrapper_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const std::shared_ptr value_; + + GTEST_DISALLOW_ASSIGN_(ReturnAction); +}; + +// Implements the ReturnNull() action. +class ReturnNullAction { + public: + // Allows ReturnNull() to be used in any pointer-returning function. In C++11 + // this is enforced by returning nullptr, and in non-C++11 by asserting a + // pointer type on compile time. + template + static Result Perform(const ArgumentTuple&) { + return nullptr; + } +}; + +// Implements the Return() action. 
+class ReturnVoidAction { + public: + // Allows Return() to be used in any void-returning function. + template + static void Perform(const ArgumentTuple&) { + static_assert(std::is_void::value, "Result should be void."); + } +}; + +// Implements the polymorphic ReturnRef(x) action, which can be used +// in any function that returns a reference to the type of x, +// regardless of the argument types. +template +class ReturnRefAction { + public: + // Constructs a ReturnRefAction object from the reference to be returned. + explicit ReturnRefAction(T& ref) : ref_(ref) {} // NOLINT + + // This template type conversion operator allows ReturnRef(x) to be + // used in ANY function that returns a reference to x's type. + template + operator Action() const { + typedef typename Function::Result Result; + // Asserts that the function return type is a reference. This + // catches the user error of using ReturnRef(x) when Return(x) + // should be used, and generates some helpful error message. + GTEST_COMPILE_ASSERT_(std::is_reference::value, + use_Return_instead_of_ReturnRef_to_return_a_value); + return Action(new Impl(ref_)); + } + + private: + // Implements the ReturnRef(x) action for a particular function type F. + template + class Impl : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + explicit Impl(T& ref) : ref_(ref) {} // NOLINT + + Result Perform(const ArgumentTuple&) override { return ref_; } + + private: + T& ref_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + T& ref_; + + GTEST_DISALLOW_ASSIGN_(ReturnRefAction); +}; + +// Implements the polymorphic ReturnRefOfCopy(x) action, which can be +// used in any function that returns a reference to the type of x, +// regardless of the argument types. +template +class ReturnRefOfCopyAction { + public: + // Constructs a ReturnRefOfCopyAction object from the reference to + // be returned. + explicit ReturnRefOfCopyAction(const T& value) : value_(value) {} // NOLINT + + // This template type conversion operator allows ReturnRefOfCopy(x) to be + // used in ANY function that returns a reference to x's type. + template + operator Action() const { + typedef typename Function::Result Result; + // Asserts that the function return type is a reference. This + // catches the user error of using ReturnRefOfCopy(x) when Return(x) + // should be used, and generates some helpful error message. + GTEST_COMPILE_ASSERT_( + std::is_reference::value, + use_Return_instead_of_ReturnRefOfCopy_to_return_a_value); + return Action(new Impl(value_)); + } + + private: + // Implements the ReturnRefOfCopy(x) action for a particular function type F. + template + class Impl : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + explicit Impl(const T& value) : value_(value) {} // NOLINT + + Result Perform(const ArgumentTuple&) override { return value_; } + + private: + T value_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const T value_; + + GTEST_DISALLOW_ASSIGN_(ReturnRefOfCopyAction); +}; + +// Implements the polymorphic ReturnRoundRobin(v) action, which can be +// used in any function that returns the element_type of v. +template +class ReturnRoundRobinAction { + public: + explicit ReturnRoundRobinAction(std::vector values) { + GTEST_CHECK_(!values.empty()) + << "ReturnRoundRobin requires at least one element."; + state_->values = std::move(values); + } + + template + T operator()(Args&&...) 
const { + return state_->Next(); + } + + private: + struct State { + T Next() { + T ret_val = values[i++]; + if (i == values.size()) i = 0; + return ret_val; + } + + std::vector values; + size_t i = 0; + }; + std::shared_ptr state_ = std::make_shared(); +}; + +// Implements the polymorphic DoDefault() action. +class DoDefaultAction { + public: + // This template type conversion operator allows DoDefault() to be + // used in any function. + template + operator Action() const { return Action(); } // NOLINT +}; + +// Implements the Assign action to set a given pointer referent to a +// particular value. +template +class AssignAction { + public: + AssignAction(T1* ptr, T2 value) : ptr_(ptr), value_(value) {} + + template + void Perform(const ArgumentTuple& /* args */) const { + *ptr_ = value_; + } + + private: + T1* const ptr_; + const T2 value_; + + GTEST_DISALLOW_ASSIGN_(AssignAction); +}; + +#if !GTEST_OS_WINDOWS_MOBILE + +// Implements the SetErrnoAndReturn action to simulate return from +// various system calls and libc functions. +template +class SetErrnoAndReturnAction { + public: + SetErrnoAndReturnAction(int errno_value, T result) + : errno_(errno_value), + result_(result) {} + template + Result Perform(const ArgumentTuple& /* args */) const { + errno = errno_; + return result_; + } + + private: + const int errno_; + const T result_; + + GTEST_DISALLOW_ASSIGN_(SetErrnoAndReturnAction); +}; + +#endif // !GTEST_OS_WINDOWS_MOBILE + +// Implements the SetArgumentPointee(x) action for any function +// whose N-th argument (0-based) is a pointer to x's type. +template +struct SetArgumentPointeeAction { + A value; + + template + void operator()(const Args&... args) const { + *::std::get(std::tie(args...)) = value; + } +}; + +// Implements the Invoke(object_ptr, &Class::Method) action. +template +struct InvokeMethodAction { + Class* const obj_ptr; + const MethodPtr method_ptr; + + template + auto operator()(Args&&... args) const + -> decltype((obj_ptr->*method_ptr)(std::forward(args)...)) { + return (obj_ptr->*method_ptr)(std::forward(args)...); + } +}; + +// Implements the InvokeWithoutArgs(f) action. The template argument +// FunctionImpl is the implementation type of f, which can be either a +// function pointer or a functor. InvokeWithoutArgs(f) can be used as an +// Action as long as f's type is compatible with F. +template +struct InvokeWithoutArgsAction { + FunctionImpl function_impl; + + // Allows InvokeWithoutArgs(f) to be used as any action whose type is + // compatible with f. + template + auto operator()(const Args&...) -> decltype(function_impl()) { + return function_impl(); + } +}; + +// Implements the InvokeWithoutArgs(object_ptr, &Class::Method) action. +template +struct InvokeMethodWithoutArgsAction { + Class* const obj_ptr; + const MethodPtr method_ptr; + + using ReturnType = + decltype((std::declval()->*std::declval())()); + + template + ReturnType operator()(const Args&...) const { + return (obj_ptr->*method_ptr)(); + } +}; + +// Implements the IgnoreResult(action) action. +template +class IgnoreResultAction { + public: + explicit IgnoreResultAction(const A& action) : action_(action) {} + + template + operator Action() const { + // Assert statement belongs here because this is the best place to verify + // conditions on F. It produces the clearest error messages + // in most compilers. + // Impl really belongs in this scope as a local class but can't + // because MSVC produces duplicate symbols in different translation units + // in this case. 
Until MS fixes that bug we put Impl into the class scope + // and put the typedef both here (for use in assert statement) and + // in the Impl class. But both definitions must be the same. + typedef typename internal::Function::Result Result; + + // Asserts at compile time that F returns void. + static_assert(std::is_void::value, "Result type should be void."); + + return Action(new Impl(action_)); + } + + private: + template + class Impl : public ActionInterface { + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + explicit Impl(const A& action) : action_(action) {} + + void Perform(const ArgumentTuple& args) override { + // Performs the action and ignores its result. + action_.Perform(args); + } + + private: + // Type OriginalFunction is the same as F except that its return + // type is IgnoredValue. + typedef typename internal::Function::MakeResultIgnoredValue + OriginalFunction; + + const Action action_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const A action_; + + GTEST_DISALLOW_ASSIGN_(IgnoreResultAction); +}; + +template +struct WithArgsAction { + InnerAction action; + + // The inner action could be anything convertible to Action. + // We use the conversion operator to detect the signature of the inner Action. + template + operator Action() const { // NOLINT + using TupleType = std::tuple; + Action::type...)> + converted(action); + + return [converted](Args... args) -> R { + return converted.Perform(std::forward_as_tuple( + std::get(std::forward_as_tuple(std::forward(args)...))...)); + }; + } +}; + +template +struct DoAllAction { + private: + template + std::vector> Convert(IndexSequence) const { + return {std::get(actions)...}; + } + + public: + std::tuple actions; + + template + operator Action() const { // NOLINT + struct Op { + std::vector> converted; + Action last; + R operator()(Args... args) const { + auto tuple_args = std::forward_as_tuple(std::forward(args)...); + for (auto& a : converted) { + a.Perform(tuple_args); + } + return last.Perform(tuple_args); + } + }; + return Op{Convert(MakeIndexSequence()), + std::get(actions)}; + } +}; + +} // namespace internal + +// An Unused object can be implicitly constructed from ANY value. +// This is handy when defining actions that ignore some or all of the +// mock function arguments. For example, given +// +// MOCK_METHOD3(Foo, double(const string& label, double x, double y)); +// MOCK_METHOD3(Bar, double(int index, double x, double y)); +// +// instead of +// +// double DistanceToOriginWithLabel(const string& label, double x, double y) { +// return sqrt(x*x + y*y); +// } +// double DistanceToOriginWithIndex(int index, double x, double y) { +// return sqrt(x*x + y*y); +// } +// ... +// EXPECT_CALL(mock, Foo("abc", _, _)) +// .WillOnce(Invoke(DistanceToOriginWithLabel)); +// EXPECT_CALL(mock, Bar(5, _, _)) +// .WillOnce(Invoke(DistanceToOriginWithIndex)); +// +// you could write +// +// // We can declare any uninteresting argument as Unused. +// double DistanceToOrigin(Unused, double x, double y) { +// return sqrt(x*x + y*y); +// } +// ... +// EXPECT_CALL(mock, Foo("abc", _, _)).WillOnce(Invoke(DistanceToOrigin)); +// EXPECT_CALL(mock, Bar(5, _, _)).WillOnce(Invoke(DistanceToOrigin)); +typedef internal::IgnoredValue Unused; + +// Creates an action that does actions a1, a2, ..., sequentially in +// each invocation. +template +internal::DoAllAction::type...> DoAll( + Action&&... 
action) { + return {std::forward_as_tuple(std::forward(action)...)}; +} + +// WithArg(an_action) creates an action that passes the k-th +// (0-based) argument of the mock function to an_action and performs +// it. It adapts an action accepting one argument to one that accepts +// multiple arguments. For convenience, we also provide +// WithArgs(an_action) (defined below) as a synonym. +template +internal::WithArgsAction::type, k> +WithArg(InnerAction&& action) { + return {std::forward(action)}; +} + +// WithArgs(an_action) creates an action that passes +// the selected arguments of the mock function to an_action and +// performs it. It serves as an adaptor between actions with +// different argument lists. +template +internal::WithArgsAction::type, k, ks...> +WithArgs(InnerAction&& action) { + return {std::forward(action)}; +} + +// WithoutArgs(inner_action) can be used in a mock function with a +// non-empty argument list to perform inner_action, which takes no +// argument. In other words, it adapts an action accepting no +// argument to one that accepts (and ignores) arguments. +template +internal::WithArgsAction::type> +WithoutArgs(InnerAction&& action) { + return {std::forward(action)}; +} + +// Creates an action that returns 'value'. 'value' is passed by value +// instead of const reference - otherwise Return("string literal") +// will trigger a compiler error about using array as initializer. +template +internal::ReturnAction Return(R value) { + return internal::ReturnAction(std::move(value)); +} + +// Creates an action that returns NULL. +inline PolymorphicAction ReturnNull() { + return MakePolymorphicAction(internal::ReturnNullAction()); +} + +// Creates an action that returns from a void function. +inline PolymorphicAction Return() { + return MakePolymorphicAction(internal::ReturnVoidAction()); +} + +// Creates an action that returns the reference to a variable. +template +inline internal::ReturnRefAction ReturnRef(R& x) { // NOLINT + return internal::ReturnRefAction(x); +} + +// Prevent using ReturnRef on reference to temporary. +template +internal::ReturnRefAction ReturnRef(R&&) = delete; + +// Creates an action that returns the reference to a copy of the +// argument. The copy is created when the action is constructed and +// lives as long as the action. +template +inline internal::ReturnRefOfCopyAction ReturnRefOfCopy(const R& x) { + return internal::ReturnRefOfCopyAction(x); +} + +// Modifies the parent action (a Return() action) to perform a move of the +// argument instead of a copy. +// Return(ByMove()) actions can only be executed once and will assert this +// invariant. +template +internal::ByMoveWrapper ByMove(R x) { + return internal::ByMoveWrapper(std::move(x)); +} + +// Creates an action that returns an element of `vals`. Calling this action will +// repeatedly return the next value from `vals` until it reaches the end and +// will restart from the beginning. +template +internal::ReturnRoundRobinAction ReturnRoundRobin(std::vector vals) { + return internal::ReturnRoundRobinAction(std::move(vals)); +} + +// Creates an action that returns an element of `vals`. Calling this action will +// repeatedly return the next value from `vals` until it reaches the end and +// will restart from the beginning. +template +internal::ReturnRoundRobinAction ReturnRoundRobin( + std::initializer_list vals) { + return internal::ReturnRoundRobinAction(std::vector(vals)); +} + +// Creates an action that does the default action for the give mock function. 
+inline internal::DoDefaultAction DoDefault() { + return internal::DoDefaultAction(); +} + +// Creates an action that sets the variable pointed by the N-th +// (0-based) function argument to 'value'. +template +internal::SetArgumentPointeeAction SetArgPointee(T value) { + return {std::move(value)}; +} + +// The following version is DEPRECATED. +template +internal::SetArgumentPointeeAction SetArgumentPointee(T value) { + return {std::move(value)}; +} + +// Creates an action that sets a pointer referent to a given value. +template +PolymorphicAction > Assign(T1* ptr, T2 val) { + return MakePolymorphicAction(internal::AssignAction(ptr, val)); +} + +#if !GTEST_OS_WINDOWS_MOBILE + +// Creates an action that sets errno and returns the appropriate error. +template +PolymorphicAction > +SetErrnoAndReturn(int errval, T result) { + return MakePolymorphicAction( + internal::SetErrnoAndReturnAction(errval, result)); +} + +#endif // !GTEST_OS_WINDOWS_MOBILE + +// Various overloads for Invoke(). + +// Legacy function. +// Actions can now be implicitly constructed from callables. No need to create +// wrapper objects. +// This function exists for backwards compatibility. +template +typename std::decay::type Invoke(FunctionImpl&& function_impl) { + return std::forward(function_impl); +} + +// Creates an action that invokes the given method on the given object +// with the mock function's arguments. +template +internal::InvokeMethodAction Invoke(Class* obj_ptr, + MethodPtr method_ptr) { + return {obj_ptr, method_ptr}; +} + +// Creates an action that invokes 'function_impl' with no argument. +template +internal::InvokeWithoutArgsAction::type> +InvokeWithoutArgs(FunctionImpl function_impl) { + return {std::move(function_impl)}; +} + +// Creates an action that invokes the given method on the given object +// with no argument. +template +internal::InvokeMethodWithoutArgsAction InvokeWithoutArgs( + Class* obj_ptr, MethodPtr method_ptr) { + return {obj_ptr, method_ptr}; +} + +// Creates an action that performs an_action and throws away its +// result. In other words, it changes the return type of an_action to +// void. an_action MUST NOT return void, or the code won't compile. +template +inline internal::IgnoreResultAction IgnoreResult(const A& an_action) { + return internal::IgnoreResultAction(an_action); +} + +// Creates a reference wrapper for the given L-value. If necessary, +// you can explicitly specify the type of the reference. For example, +// suppose 'derived' is an object of type Derived, ByRef(derived) +// would wrap a Derived&. If you want to wrap a const Base& instead, +// where Base is a base class of Derived, just write: +// +// ByRef(derived) +// +// N.B. ByRef is redundant with std::ref, std::cref and std::reference_wrapper. +// However, it may still be used for consistency with ByMove(). +template +inline ::std::reference_wrapper ByRef(T& l_value) { // NOLINT + return ::std::reference_wrapper(l_value); +} + +namespace internal { + +// A macro from the ACTION* family (defined later in gmock-generated-actions.h) +// defines an action that can be used in a mock function. Typically, +// these actions only care about a subset of the arguments of the mock +// function. For example, if such an action only uses the second +// argument, it can be used in any mock function that takes >= 2 +// arguments where the type of the second argument is compatible. +// +// Therefore, the action implementation must be prepared to take more +// arguments than it needs. 
The ExcessiveArg type is used to +// represent those excessive arguments. In order to keep the compiler +// error messages tractable, we define it in the testing namespace +// instead of testing::internal. However, this is an INTERNAL TYPE +// and subject to change without notice, so a user MUST NOT USE THIS +// TYPE DIRECTLY. +struct ExcessiveArg {}; + +// A helper class needed for implementing the ACTION* macros. +template +class ActionHelper { + public: + template + static Result Perform(Impl* impl, const std::tuple& args) { + return Apply(impl, args, MakeIndexSequence{}, + MakeIndexSequence<10 - sizeof...(Ts)>{}); + } + + private: + template + static Result Apply(Impl* impl, const std::tuple& args, + IndexSequence, IndexSequence) { + return impl->template gmock_PerformImpl( + args, std::get(args)..., + ((void)rest_ids, ExcessiveArg())...); + } +}; + +// A helper base class needed for implementing the ACTION* macros. +// Implements constructor and conversion operator for Action. +// +// Template specialization for parameterless Action. +template +class ActionImpl { + public: + ActionImpl() = default; + + template + operator ::testing::Action() const { // NOLINT(runtime/explicit) + return ::testing::Action(new typename Derived::template gmock_Impl()); + } +}; + +// Template specialization for parameterized Action. +template